Google Account
Angela Ogbonna
angelaekele20@gmail.com
Commands Code Text
Notebook
Code Text

Gemini
import pandas as pd
Code Text

Gemini

MY PROJECT

Code Text

Gemini
# Raw-GitHub locations of the three training datasets
_DATA_ROOT = "https://raw.githubusercontent.com/Oyeniran20/axia_cohort_8/refs/heads/main/"
url1 = _DATA_ROOT + "trainperf.csv"           # current-loan performance
url2 = _DATA_ROOT + "traindemographics.csv"   # customer demographics
url3 = _DATA_ROOT + "trainprevloans.csv"      # previous-loan history
Code Text

Gemini
# import all the necessary libraries
!pip install xgboost --quiet
!pip install catboost --quiet
!pip install shap --quiet 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import time
import shap
import joblib
pd.set_option('display.max_rows'None)
pd.set_option('display.float_format''{:.2f}'.format)

from sklearn.model_selection import train_test_split
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold # feature selector
# Import the Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier 
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
# import pipelines
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline  # important: from imblearn, not sklearn
from imblearn.over_sampling import SMOTE
# import metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.2/99.2 MB 7.3 MB/s eta 0:00:00
Code Text

Gemini

Customers Performance Dataset

Code Text

Gemini
# Load the customers' current-loan performance dataset and preview it
df_perf = pd.read_csv(filepath_or_buffer=url1)
df_perf.head()
Code Text

Gemini
# Convert datetime columns to proper datetime format
df_perf['approveddate'] = pd.to_datetime(df_perf['approveddate'])
df_perf['creationdate'] = pd.to_datetime(df_perf['creationdate'])

# Extract just the time portion and create new columns
df_perf['approved_time'] = df_perf['approveddate'].dt.time
df_perf['creation_time'] = df_perf['creationdate'].dt.time

# Check the results
print("Original columns with date and time:")
# FIX: added the missing comma — 'approveddate''creationdate' silently
# concatenates into the non-existent column 'approveddatecreationdate'
# and would raise KeyError
print(df_perf[['approveddate', 'creationdate']].head())

Original columns with date and time:
         approveddate        creationdate
0 2017-07-25 08:22:56 2017-07-25 07:22:47
1 2017-07-05 17:04:41 2017-07-05 16:04:18
2 2017-07-06 14:52:57 2017-07-06 13:52:51
3 2017-07-27 19:00:41 2017-07-27 18:00:35
4 2017-07-03 23:42:45 2017-07-03 22:42:39
Code Text

Gemini
# Remove time portion from the original datetime columns (keep only date)
df_perf['approveddate'] = df_perf['approveddate'].dt.date
df_perf['creationdate'] = df_perf['creationdate'].dt.date

# Check the results
print("Updated columns - dates only:")
# FIX: commas were missing between the column names in the three selections
# below; adjacent string literals concatenate into one bogus column name,
# raising KeyError at runtime
print(df_perf[['approveddate', 'creationdate']].head())

print("\nTime columns we created earlier:")
print(df_perf[['approved_time', 'creation_time']].head())

# View the complete result
df_perf[['approveddate', 'approved_time', 'creationdate', 'creation_time']].head()
Code Text

Gemini
# Preview df_perf after the date/time split
df_perf.head()
Code Text

Gemini

Customers Demographics Dataset

Code Text

Gemini
# Load the customers' demographics dataset and preview it
df_demo = pd.read_csv(filepath_or_buffer=url2)
df_demo.head()
Code Text

Gemini
# First make sure birthdate is in datetime format
df_demo['birthdate'] = pd.to_datetime(df_demo['birthdate'])

# Create a new column for birth year
df_demo['birth_year'] = df_demo['birthdate'].dt.year

# Now convert birthdate back to date only (remove time portion)
df_demo['birthdate'] = df_demo['birthdate'].dt.date

# Check the results
print("Birthdate with extracted year:")
# FIX: added the missing commas between column names — adjacent string
# literals concatenate into one bogus column name and raise KeyError
print(df_demo[['customerid', 'birthdate', 'birth_year']].head())

# View the updated dataframe
df_demo.head()
Code Text

Gemini

Customers Historical Loan Dataset

Code Text

Gemini
# Load the customers' previous-loans dataset and preview it
df_prevloans = pd.read_csv(filepath_or_buffer=url3)
df_prevloans.head()
Code Text

Gemini
# Convert all datetime columns to proper datetime format first
# FIX: commas were missing between the column names — adjacent string
# literals concatenate, so the list held one bogus name and none of the
# conversions below would ever run
datetime_columns = ['approveddate', 'creationdate', 'closeddate', 'firstduedate', 'firstrepaiddate']

# Step 1: Convert to datetime format
for col in datetime_columns:
    if col in df_prevloans.columns:
        df_prevloans[col] = pd.to_datetime(df_prevloans[col])

# Step 2: Extract time portions into new columns
df_prevloans['approved_time'] = df_prevloans['approveddate'].dt.time
df_prevloans['creation_time'] = df_prevloans['creationdate'].dt.time
df_prevloans['closed_time'] = df_prevloans['closeddate'].dt.time
df_prevloans['firstdue_time'] = df_prevloans['firstduedate'].dt.time
df_prevloans['firstrepaid_time'] = df_prevloans['firstrepaiddate'].dt.time

# Step 3: Convert original columns to date only (remove time)
for col in datetime_columns:
    if col in df_prevloans.columns:
        df_prevloans[col] = df_prevloans[col].dt.date

# Check the results
print("Original date columns (now date only):")
print(df_prevloans[['approveddate', 'creationdate', 'closeddate', 'firstduedate', 'firstrepaiddate']].head())

print("\nNew time columns:")
print(df_prevloans[['approved_time', 'creation_time', 'closed_time', 'firstdue_time', 'firstrepaid_time']].head())

# Display updated info
print(f"\nDataset shape: {df_prevloans.shape}")
print("All columns:", df_prevloans.columns.tolist())
Original date columns (now date only):
  approveddate creationdate  closeddate firstduedate firstrepaiddate
0   2016-08-15   2016-08-15  2016-09-01   2016-09-14      2016-09-01
1   2017-04-28   2017-04-28  2017-05-28   2017-05-30      2017-05-26
2   2017-03-05   2017-03-05  2017-04-26   2017-04-04      2017-04-26
3   2017-04-09   2017-04-09  2017-04-24   2017-04-24      2017-04-24
4   2017-06-17   2017-06-17  2017-07-14   2017-07-03      2017-07-14

New time columns:
  approved_time creation_time closed_time firstdue_time firstrepaid_time
0      18:22:40      17:22:32    16:06:48      00:00:00         15:51:43
1      18:39:07      17:38:53    14:44:49      00:00:00         00:00:00
2      10:56:25      09:56:19    22:18:56      00:00:00         22:03:47
3      18:25:55      17:25:42    01:35:52      00:00:00         00:48:43
4      09:29:57      08:29:50    21:18:43      00:00:00         21:08:35

Dataset shape: (18183, 17)
All columns: ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate', 'loanamount', 'totaldue', 'termdays', 'closeddate', 'referredby', 'firstduedate', 'firstrepaiddate', 'approved_time', 'creation_time', 'closed_time', 'firstdue_time', 'firstrepaid_time']
Code Text

Gemini
# Preview df_prevloans after the date/time split
df_prevloans.head()
Code Text

Gemini

Customers Performance Dataset(Current Loan)

Code Text

Gemini
Code Text

Gemini
# checking for the size of the data
df_perf.shape
(4368, 12)
Code Text

Gemini
# listing out all the columns of the performance dataset
df_perf.columns
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
       'creationdate', 'loanamount', 'totaldue', 'termdays', 'referredby',
       'good_bad_flag', 'approved_time', 'creation_time'],
      dtype='object')
Code Text

Gemini
# checking for the info of the perfomance dataset
df_perf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4368 entries, 0 to 4367
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   customerid     4368 non-null   object 
 1   systemloanid   4368 non-null   int64  
 2   loannumber     4368 non-null   int64  
 3   approveddate   4368 non-null   object 
 4   creationdate   4368 non-null   object 
 5   loanamount     4368 non-null   float64
 6   totaldue       4368 non-null   float64
 7   termdays       4368 non-null   int64  
 8   referredby     587 non-null    object 
 9   good_bad_flag  4368 non-null   object 
 10  approved_time  4368 non-null   object 
 11  creation_time  4368 non-null   object 
dtypes: float64(2), int64(3), object(7)
memory usage: 409.6+ KB
Code Text

Gemini
Observations gotten from the information of the customers performance dataset:
  • We have columns: 'approveddate' and 'creationdate' with a data-type problem
  • We have column referredby with a missing-value problem
Solutions to these observations:
  • we change the datatype of the 'approveddate' and 'creationdate' to datetime
  • we handle the missing value by either dropping the column(i.e., if NaN values > 80%) or filling it appropriately.
Code Text

Gemini
# checking the number of missing values in our dataset
df_perf.isna().sum()
Code Text

Gemini
# checking the percentage of missing values in our dataset
(df_perf.isna().sum().sort_values(ascending=False)/len(df_perf))*100
Code Text

Gemini
# Handle missing referredby (~87% missing — too sparse to be useful)
# FIX: the closing parenthesis was missing (SyntaxError)
df_perf.drop('referredby', axis=1, inplace=True)
Code Text

Gemini
# confirming if we dropped the referredby column
(df_perf.isna().sum().sort_values(ascending=False)/len(df_perf))*100
Code Text

Gemini
# Convert both date columns back to proper datetime dtype
for _date_col in ('approveddate', 'creationdate'):
    df_perf[_date_col] = pd.to_datetime(df_perf[_date_col])
Code Text

Gemini
# verifying the datatypes after the conversion
df_perf.dtypes
Code Text

Gemini
np.int64(0)
Code Text

Gemini
np.int64(0)
Code Text

Gemini
# checking if our primary key(customerid) is unique
df_perf['customerid'].nunique()
4368
Code Text

Gemini
# confirming if our dataset is clean
df_perf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4368 entries, 0 to 4367
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   customerid     4368 non-null   object        
 1   systemloanid   4368 non-null   int64         
 2   loannumber     4368 non-null   int64         
 3   approveddate   4368 non-null   datetime64[ns]
 4   creationdate   4368 non-null   datetime64[ns]
 5   loanamount     4368 non-null   float64       
 6   totaldue       4368 non-null   float64       
 7   termdays       4368 non-null   int64         
 8   good_bad_flag  4368 non-null   object        
 9   approved_time  4368 non-null   object        
 10  creation_time  4368 non-null   object        
dtypes: datetime64[ns](2), float64(2), int64(3), object(4)
memory usage: 375.5+ KB
Code Text

Gemini

FEATURE ENGINEERING FOR THE CUSTOMERS PERFORMANCE DATASET(CURRENT LOAN)¶

Code Text

Gemini

DEMOGRAPHICS DATASET

Code Text

Gemini
Code Text

Gemini
# checking for the size of the data
df_demo.shape
(4346, 10)
Code Text

Gemini
# listing out all the columns for 
df_demo.columns
Index(['customerid', 'birthdate', 'bank_account_type', 'longitude_gps',
       'latitude_gps', 'bank_name_clients', 'bank_branch_clients',
       'employment_status_clients', 'level_of_education_clients',
       'birth_year'],
      dtype='object')
Code Text

Gemini
# checking for the info of the demographics dataset
df_demo.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4346 entries, 0 to 4345
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  4346 non-null   object 
 1   birthdate                   4346 non-null   object 
 2   bank_account_type           4346 non-null   object 
 3   longitude_gps               4346 non-null   float64
 4   latitude_gps                4346 non-null   float64
 5   bank_name_clients           4346 non-null   object 
 6   bank_branch_clients         51 non-null     object 
 7   employment_status_clients   3698 non-null   object 
 8   level_of_education_clients  587 non-null    object 
 9   birth_year                  4346 non-null   int32  
dtypes: float64(2), int32(1), object(7)
memory usage: 322.7+ KB
Code Text

Gemini
# checking the number of missing values in our dataset
df_demo.isna().sum()
Code Text

Gemini
# checking the percentage of missing values in our dataset
(df_demo.isna().sum().sort_values(ascending=False)/len(df_demo))*100
Code Text

Gemini
# For columns with high missingness (>80%), we consider dropping
# FIX: added the missing comma — without it the two strings concatenate
# into the bogus name 'bank_branch_clientslevel_of_education_clients'
# and drop() raises KeyError
df_demo.drop(['bank_branch_clients', 'level_of_education_clients'], axis=1, inplace=True)
Code Text

Gemini
# lets see the unique values of employment_status_clients
df_demo.employment_status_clients.unique()
array([nan, 'Permanent', 'Student', 'Self-Employed', 'Unemployed',
       'Retired', 'Contract'], dtype=object)
Code Text

Gemini
# Distribution of employment status; passing the data via the x= keyword,
# since positional Series arguments are deprecated/removed in modern seaborn
sns.countplot(x=df_demo['employment_status_clients'])
Code Text

Gemini

From the plot, we notice that the 'Permanent' category dominates; therefore, filling with the most frequent value would introduce bias. Instead, we will fill with 'unknown', meaning that the 'employment_status_clients' of about 15% of customers is unknown.

Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
# converting the birthdate back to datetime dtype (it was reduced to
# plain dates earlier; later steps derive the customer's age from it)
df_demo['birthdate'] = pd.to_datetime(df_demo['birthdate'])
Code Text

Gemini
# verifying the datatypes after the birthdate conversion
df_demo.dtypes
Code Text

Gemini
# checking for duplicates
df_demo.duplicated().sum()
np.int64(12)
Code Text

Gemini
np.int64(12)
Code Text

Gemini
# checking if our primary key(customerid) is unique
df_demo['customerid'].nunique()
4334
Code Text

Gemini
df_demo.shape
(4346, 8)
Code Text

Gemini

We Notice that our primary key here is not unique and has duplicates, so lets drop duplicates

Code Text

Gemini
# dropping the duplicated rows so that customerid becomes unique
df_demo = df_demo.drop_duplicates()
Code Text

Gemini
# confirming if duplicates was dropped
df_demo.duplicated().sum()
np.int64(0)
Code Text

Gemini
# checking the size of the dataset
df_demo.shape
(4334, 8)
Code Text

Gemini

The primary key is now unique

Code Text

Gemini
<class 'pandas.core.frame.DataFrame'>
Index: 4334 entries, 0 to 4345
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   customerid                 4334 non-null   object        
 1   birthdate                  4334 non-null   datetime64[ns]
 2   bank_account_type          4334 non-null   object        
 3   longitude_gps              4334 non-null   float64       
 4   latitude_gps               4334 non-null   float64       
 5   bank_name_clients          4334 non-null   object        
 6   employment_status_clients  4334 non-null   object        
 7   birth_year                 4334 non-null   int32         
dtypes: datetime64[ns](1), float64(2), int32(1), object(4)
memory usage: 287.8+ KB
Code Text

Gemini

Previous Loan Dataset

Code Text

Gemini
# re-importing the previous loans dataset
# NOTE: this loads a fresh copy, discarding the date/time split performed
# on df_prevloans earlier in the notebook — cleaning restarts here
df_prevloans = pd.read_csv(url3)
df_prevloans.head()
Code Text

Gemini
# size of the dataset
df_prevloans.shape
(18183, 12)
Code Text

Gemini
# columns of the dataset
df_prevloans.columns
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
       'creationdate', 'loanamount', 'totaldue', 'termdays', 'closeddate',
       'referredby', 'firstduedate', 'firstrepaiddate'],
      dtype='object')
Code Text

Gemini
# checking the number of missing values in our dataset
df_prevloans.isna().sum()
Code Text

Gemini
# Handle missing referredby (too many missing to be useful)
# FIX: the closing parenthesis was missing (SyntaxError)
df_prevloans.drop('referredby', axis=1, inplace=True)
Code Text

Gemini
# confirming that we don't have missing values anymore
(df_prevloans.isna().sum().sort_values(ascending=False)/len(df_prevloans))*100
Code Text

Gemini
# Convert all date columns to datetime dtype
# FIX: commas added between the column names — adjacent string literals
# concatenate, leaving a single bogus column name in the list, so
# pd.to_datetime would raise KeyError
date_cols = ['approveddate', 'creationdate', 'closeddate', 'firstduedate', 'firstrepaiddate']
for col in date_cols:
    df_prevloans[col] = pd.to_datetime(df_prevloans[col])
Code Text

Gemini
df_prevloans.dtypes
Code Text

Gemini
# checking for duplicates 
df_prevloans.duplicated().sum()
np.int64(0)
Code Text

Gemini
# checking for duplicates in the primary key
df_prevloans['customerid'].duplicated().sum()
np.int64(13824)
Code Text

Gemini
# checking for number of unique customers
df_prevloans['customerid'].nunique()
4359
Code Text

Gemini
# checking the size of the dataset
df_prevloans.shape
(18183, 11)
Code Text

Gemini

Observation

Now observe that in our customers' previous loan dataset, the customerid is not unique: out of the 18,183 records, only 4,359 customerids are unique, leaving about 13,824 duplicates (which implies that a customer may have many past loans). This contradicts our aim of utilizing customer behavior and financial data to build a predictive model that enhances accuracy and efficiency in risk assessment, as it would cause data misalignment if not treated. In order not to contradict the aim of this project, I need to make the customerid unique, i.e., one row per customer. By doing this we can utilize each customer's behavior and financial data effectively to train our model and obtain a proper, efficient risk assessment.

To be able to make our customerid unique in our customers previous loan dataset, We need to "Aggregate" df_prevloans to one row per customer(e.g,. number of loans, total loan amount) so as to enable us merge with both the customers performance dataset and the customers demographics dataset so as to avoid misalignment and also deviating from our aim.

Code Text

Gemini

Aggregation and Feature Engineering of the Customers Previous Loan Dataset

In the course of aggregating, I will create some new features from the already existing features and then aggregate.

Code Text

Gemini
# Feature Creation for the Previous Loan dataset

# Repayment Ratio: total due relative to the principal
# NOTE(review): assumes loanamount is always > 0 — confirm upstream
df_prevloans['repayment_ratio'] = df_prevloans['totaldue'] / df_prevloans['loanamount']

# Loan duration in days (closed - approved)
df_prevloans['duration_days'] = (df_prevloans['closeddate'] - df_prevloans['approveddate']).dt.days

# Time to first repayment relative to the due date (positive = repaid late)
df_prevloans['repay_delay_days'] = (df_prevloans['firstrepaiddate'] - df_prevloans['firstduedate']).dt.days

# Flag for first repayment delay - 1 if payment was late, 0 if on-time/early
df_prevloans['firstrepaid_late'] = (df_prevloans['repay_delay_days'] > 0).astype(int)

# Closure or Settlement delay - 1 if the loan ran past its agreed term
df_prevloans['closed_late'] = (df_prevloans['duration_days'] > df_prevloans['termdays']).astype(int)

# interest amount charged (amount due minus principal)
df_prevloans['interest'] = df_prevloans['totaldue'] - df_prevloans['loanamount']
Code Text

Gemini
# checking the dataset after the new features were added
df_prevloans.head()
Code Text

Gemini
# aggregating the new features per customerid using groupby
# FIX: the mapping was written without colons ('systemloanid''count'),
# which Python parses as a SET of concatenated strings instead of a dict,
# so .agg() would fail — colons restored below
df_prevloans_agg = df_prevloans.groupby('customerid').agg({
    'systemloanid': 'count',    # number of past loans
    'loanamount': 'mean',
    'repay_delay_days': 'mean',
    'firstrepaid_late': 'sum',  # total late first repayments
    'closed_late': 'sum',       # total late closures
    'repayment_ratio': 'mean',
    'duration_days': 'mean',
    'interest': 'mean'
}).reset_index()
Code Text

Gemini
# renaming the columns of the aggregated previous-loan dataset
# FIX: commas were missing between the names — adjacent string literals
# concatenate, yielding too few names and a ValueError on assignment
df_prevloans_agg.columns = ['customerid', 'num_prev_loans', 'avg_prev_loanamt',
                            'avg_repay_delay_days', 'total_firstrepaid_late',
                            'total_closed_late', 'avg_prev_repayment_ratio',
                            'avg_duration_days', 'avg_prev_interest']
Code Text

Gemini

After creating the new features, I grouped all rows that belong to the same customer, then aggregate their past loans into a single record per customer which now makes our customer previous loan unique and ready to be merged.

Before I proceed, I would like to give a brief description of my aggregated table:

Feature Descriptions for df_prevloans_agg

Column Name Description
customerid Unique identifier for the customer.
num_prev_loans Total number of previous loans the customer has taken.
avg_prev_loanamt Average loan amount from previous loans.
max_prev_loanamt Maximum loan amount from previous loans.
min_prev_loanamt Minimum loan amount from previous loans.
total_firstrepaid_late Total count of previous loans where the first repayment was made after the due date.
total_closed_late Total count of previous loans that were closed later than the planned term.
avg_prev_repayment_ratio Average ratio of total amount repaid to the original loan amount for previous loans.
avg_duration_days Average number of days from loan approval to loan closure for previous loans.
avg_prev_interest_rate Average interest rate of previous loans, calculated as interest per loan amount per term.
avg_prev_interest Average interest amount paid for previous loans.
Code Text

Gemini
# viewing the first 5 
df_prevloans_agg.head()
Code Text

Gemini
# checking the size
df_prevloans_agg.shape
(4359, 9)
Code Text

Gemini
4359
Code Text

Gemini
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4359 entries, 0 to 4358
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customerid                4359 non-null   object 
 1   num_prev_loans            4359 non-null   int64  
 2   avg_prev_loanamt          4359 non-null   float64
 3   avg_repay_delay_days      4359 non-null   float64
 4   total_firstrepaid_late    4359 non-null   int64  
 5   total_closed_late         4359 non-null   int64  
 6   avg_prev_repayment_ratio  4359 non-null   float64
 7   avg_duration_days         4359 non-null   float64
 8   avg_prev_interest         4359 non-null   float64
dtypes: float64(5), int64(3), object(1)
memory usage: 306.6+ KB
Code Text

Gemini
np.int64(0)
Code Text

Gemini
(4368, 11)
Code Text

Gemini
(4334, 8)
Code Text

Gemini
(4359, 9)
Code Text

Gemini
Customers in perf not found in demo: 1099
Code Text

Gemini
                          customerid  systemloanid  loannumber approveddate  \
5   8a8589f35451855401546b0738c42524     301986516           8   2017-07-19   
29  8a858ee55830c4b90158337542ab18a1     301972649           6   2017-07-11   
34  8a858f1955b1c4df0155cd14c5b478ed     302000569           2   2017-07-28   
38  8a858f3d5add42e2015ae0ca6cb66b83     301998400           8   2017-07-27   
40  8a858f4f5511dca201551b73634170b6     301997295           4   2017-07-26   

   creationdate  loanamount  totaldue  termdays good_bad_flag approved_time  \
5    2017-07-19    30000.00  39000.00        60          Good      21:46:24   
29   2017-07-11    10000.00  13000.00        30           Bad      09:28:30   
34   2017-07-28    10000.00  13000.00        30          Good      13:17:13   
38   2017-07-27    30000.00  34500.00        30          Good      09:34:11   
40   2017-07-26    10000.00  13000.00        30          Good      16:15:25   

   creation_time  
5       20:46:18  
29      08:27:20  
34      12:16:00  
38      08:34:05  
40      15:15:18  
Code Text

Gemini

Now i will merge

Code Text

Gemini
Code Text

Gemini
# viewing the size
df_main.shape
(3269, 18)
Code Text

Gemini

What this tells us is that only 3,269 customers have their demographic information fully filled, meaning that they fully applied for a loan, while 1,099 didn't have demographic records, meaning that they either didn't apply or never intended to apply.

Code Text

Gemini

Now, before I merge the already merged performance and the demographic with the previous loan, I want to see how many customers that have not collected loan in the past, thereby making them not to have historical data. They can be called new loanees.

Code Text

Gemini
# number of customers without a loan history
set_main = set(df_main['customerid'])
set_prevloans_agg = set(df_prevloans_agg['customerid'])

# customers present in the merged data but absent from the aggregated history
missing_customers = set_main.difference(set_prevloans_agg)
print(f"Customers in main not found in prevloans_agg: {len(missing_customers)}")
Customers in main not found in prevloans_agg: 5
Code Text

Gemini

So we have 5 customers not having a historical data and therefore do not have previous loans

Code Text

Gemini
# let us see the customers that have no previous-loan history
missing_ids = df_main[df_main['customerid'].isin(missing_customers)]
print(missing_ids.head())
                            customerid  systemloanid  loannumber approveddate  \
1     8a85886e54beabf90154c0a29ae757c0     301965204           2   2017-07-05   
229   8a76e7d443e6e97c0143ed0a13cb4f61     301999706           4   2017-07-28   
1675  8a858e4357be1daf0157c96f4c915ef0     302001005           2   2017-07-28   
2201  8a858fda56562f8f01565f928f516cea     301998904           2   2017-07-27   
2872  8a858e69566ae5b801567ac352d84477     301992704           2   2017-07-24   

     creationdate  loanamount  totaldue  termdays good_bad_flag approved_time  \
1      2017-07-05    15000.00  17250.00        30          Good      17:04:41   
229    2017-07-27    30000.00  39000.00        60           Bad      00:12:31   
1675   2017-07-28    10000.00  11500.00        15          Good      17:15:11   
2201   2017-07-27    10000.00  13000.00        30           Bad      14:38:40   
2872   2017-07-24    10000.00  13000.00        30          Good      05:32:11   

     creation_time  birthdate bank_account_type  longitude_gps  latitude_gps  \
1         16:04:18 1985-08-23           Savings           3.89          7.32   
229       23:11:19 1974-02-23           Savings           3.32          6.61   
1675      16:15:03 1981-01-22           Savings           5.23          7.60   
2201      13:37:26 1969-12-29           Savings           3.39          6.46   
2872      04:31:53 1981-05-14           Savings           3.37          7.12   

     bank_name_clients employment_status_clients  birth_year  
1              GT Bank                 Permanent        1985  
229            GT Bank                 Permanent        1974  
1675      Diamond Bank                 Permanent        1981  
2201               UBA                 Permanent        1969  
2872         Wema Bank                 Permanent        1981  
Code Text

Gemini

On merging, we have:

Code Text

Gemini
# Merge the (performance + demographics) table with the aggregated
# previous-loan features; a left join keeps every current-loan customer
# even when they have no loan history
df_final = df_main.merge(df_prevloans_agg, on='customerid', how='left')
Code Text

Gemini
# checking the size of the final merged dataset
df_final.shape
(3269, 26)
Code Text

Gemini
Code Text

Gemini
# checking the information of the final merged dataset
df_final.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   customerid                 3269 non-null   object        
 1   systemloanid               3269 non-null   int64         
 2   loannumber                 3269 non-null   int64         
 3   approveddate               3269 non-null   datetime64[ns]
 4   creationdate               3269 non-null   datetime64[ns]
 5   loanamount                 3269 non-null   float64       
 6   totaldue                   3269 non-null   float64       
 7   termdays                   3269 non-null   int64         
 8   good_bad_flag              3269 non-null   object        
 9   approved_time              3269 non-null   object        
 10  creation_time              3269 non-null   object        
 11  birthdate                  3269 non-null   datetime64[ns]
 12  bank_account_type          3269 non-null   object        
 13  longitude_gps              3269 non-null   float64       
 14  latitude_gps               3269 non-null   float64       
 15  bank_name_clients          3269 non-null   object        
 16  employment_status_clients  3269 non-null   object        
 17  birth_year                 3269 non-null   int32         
 18  num_prev_loans             3264 non-null   float64       
 19  avg_prev_loanamt           3264 non-null   float64       
 20  avg_repay_delay_days       3264 non-null   float64       
 21  total_firstrepaid_late     3264 non-null   float64       
 22  total_closed_late          3264 non-null   float64       
 23  avg_prev_repayment_ratio   3264 non-null   float64       
 24  avg_duration_days          3264 non-null   float64       
 25  avg_prev_interest          3264 non-null   float64       
dtypes: datetime64[ns](3), float64(12), int32(1), int64(3), object(7)
memory usage: 651.4+ KB
Code Text

Gemini

As expected, we had missing values in the prevloans_agg dataset that was merged, and that was simply because we used a left join and left join attached the dataset to the left side of the former. And that was why we noticed that we had 5 customers that didn't have any loan history but are new loanees. Now lets check for the missing value and deal with it

Code Text

Gemini
# checking the number of missing values introduced by the left join
df_final.isna().sum()
Code Text

Gemini
Code Text

Gemini

Now to deal with this missing value problem, I will fill with default values such as:

  • 0 for counts → model understands “this customer has never done this before.”
  • 0 or -1 for ratios → lets the model differentiate between “value is actually 0” and “no history.”
  • “no_history” for categories → model can learn if lack of history is a risk signal.
Code Text

Gemini
# filling the previous-loan features with 0 for customers with no history
# FIX: the dict entries were missing ':' — as written the braces formed a
# SET of concatenated strings, not a dict, so fillna would fail
# NOTE(review): 'avg_prev_interest_rate' is not a column of df_final
# (fillna silently ignores unknown keys) — confirm it was intended
df_final.fillna({
    'avg_prev_interest_rate': 0,
    'total_firstrepaid_late': 0,
    'total_closed_late': 0,
    'avg_prev_repayment_ratio': 0,  # 0 means "no repayment history"
    'avg_duration_days': 0,         # 0 means no lateness - or no history
    'num_prev_loans': 0,
    'avg_repay_delay_days': 0,
    'avg_prev_loanamt': 0,
    'avg_prev_interest': 0
}, inplace=True)
Code Text

Gemini
# checking for percentage missing values
(df_final.isna().sum().sort_values(ascending=False)/len(df_final))*100
Code Text

Gemini
Code Text

Gemini
(3269, 26)
Code Text

Gemini
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
       'creationdate', 'loanamount', 'totaldue', 'termdays', 'good_bad_flag',
       'approved_time', 'creation_time', 'birthdate', 'bank_account_type',
       'longitude_gps', 'latitude_gps', 'bank_name_clients',
       'employment_status_clients', 'birth_year', 'num_prev_loans',
       'avg_prev_loanamt', 'avg_repay_delay_days', 'total_firstrepaid_late',
       'total_closed_late', 'avg_prev_repayment_ratio', 'avg_duration_days',
       'avg_prev_interest'],
      dtype='object')
Code Text

Gemini

At this stage, I will utilize the latitude and longitude coordinates in the dataset to determine the geographic locations of all data points. Rather than restricting the analysis to a specific country, I will consider all points irrespective of their geographic boundaries. This approach involves mapping each coordinate globally and extracting the corresponding place names (such as city, state, and country) to enrich the dataset with meaningful location information. This will enhance the spatial analysis and visualization by providing clear, contextual geographic references for all data entries.

Plotting the various locations on a world map is shown below:


Gemini
Code Text

Gemini
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
       'creationdate', 'loanamount', 'totaldue', 'termdays', 'good_bad_flag',
       'approved_time', 'creation_time', 'birthdate', 'bank_account_type',
       'longitude_gps', 'latitude_gps', 'bank_name_clients',
       'employment_status_clients', 'birth_year', 'num_prev_loans',
       'avg_prev_loanamt', 'avg_repay_delay_days', 'total_firstrepaid_late',
       'total_closed_late', 'avg_prev_repayment_ratio', 'avg_duration_days',
       'avg_prev_interest'],
      dtype='object')
Code Text

Gemini

FEATURE ENGINEERING OF THE df_loan MERGED DATASET

Code Text

Gemini

I will move forward to engineering a few features from the merged customer performance, demographic, and previous-loan datasets to aid our prediction using our customers' behavior and financial data.

First, I will engineer the age using the creation date and the birthdate to get each customer's age, after which I will drop the birthdate as it will no longer be useful for our analysis and prediction. Next, I will engineer the age group of the various customers to distinguish between young adults, adults, middle-aged adults, and older adults, in order to discover which age group defaults the most, and so on.

Code Text

Gemini
Code Text

Gemini
array([45, 31, 32, 39, 30, 28, 29, 34, 52, 40, 38, 35, 41, 23, 24, 33, 26,
       36, 42, 55, 46, 47, 37, 25, 21, 43, 22, 50, 27, 51, 44, 49, 48, 53,
       54])
Code Text

Gemini
Minimum age: 21
Maximum age: 55
Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
       'creationdate', 'loanamount', 'totaldue', 'termdays', 'good_bad_flag',
       'approved_time', 'creation_time', 'bank_account_type', 'longitude_gps',
       'latitude_gps', 'bank_name_clients', 'employment_status_clients',
       'birth_year', 'num_prev_loans', 'avg_prev_loanamt',
       'avg_repay_delay_days', 'total_firstrepaid_late', 'total_closed_late',
       'avg_prev_repayment_ratio', 'avg_duration_days', 'avg_prev_interest',
       'age'],
      dtype='object')
Code Text

Gemini
(3269, 26)
Code Text

Gemini

### EXPLORATORY DATA ANALYSIS OF THE MERGED DATASET df_loan

Code Text

Gemini

Exploring the Merged Dataset

This merged dataset provides a complete view of each customer’s loan history, personal details, and current loan performance, enabling us to uncover patterns and relationships that may influence loan repayment behavior.

With the data fully cleaned and prepared, we can now proceed to explore it, analyze distributions, detect trends, and identify potential predictors for our risk assessment model.

Code Text

Gemini
Code Text

Gemini
(3269, 26)
Code Text

Gemini

Even though the merged dataset is now clean and ready for modeling, I still want to perform further checks and exploratory analysis — because you never know what might have slipped through during data processing. It’s always a good idea to double-check for anomalies, outliers, or unexpected patterns before moving forward.

Code Text

Gemini
Code Text

Gemini
np.int64(0)
Code Text

Gemini
Code Text

Gemini
Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
       'creationdate', 'loanamount', 'totaldue', 'termdays', 'good_bad_flag',
       'approved_time', 'creation_time', 'bank_account_type', 'longitude_gps',
       'latitude_gps', 'bank_name_clients', 'employment_status_clients',
       'birth_year', 'num_prev_loans', 'avg_prev_loanamt',
       'avg_repay_delay_days', 'total_firstrepaid_late', 'total_closed_late',
       'avg_prev_repayment_ratio', 'avg_duration_days', 'avg_prev_interest',
       'age'],
      dtype='object')
Code Text

Gemini

After merging and creating the new features, I now have a combined dataset (df_final) that contains customer performance, demographics, and aggregated previous loan information.
Before moving forward, here’s a quick description of the columns in the dataset:

Feature Descriptions for df_final

Column Name Description
customerid Unique identifier for each customer.
systemloanid Unique identifier for each loan transaction in the system.
loannumber Sequential number representing the loan order for a customer.
approveddate Date the loan was approved.
creationdate Date the loan record was created in the system.
loanamount Amount of money borrowed for the current loan.
totaldue Total amount to be repaid (principal + interest).
termdays Duration of the loan in days.
good_bad_flag Loan performance indicator (e.g., Binary target: 0 = Bad (defaulted), 1 = Good (did not default)
interest_curr_amount Interest amount charged on the current loan.
interest_curr_rate Interest rate applied to the current loan.
repayment_curr_ratio Ratio of amount repaid to total due for the current loan.
bank_account_type Type of bank account the customer holds (e.g., savings, checking).
longitude_gps Longitude coordinate of the customer’s recorded location.
latitude_gps Latitude coordinate of the customer’s recorded location.
bank_name_clients Name of the bank where the customer holds an account.
employment_status_clients Employment status of the customer.
num_prev_loans Total number of previous loans taken by the customer.
avg_prev_loanamt Average loan amount from previous loans.
total_firstrepaid_late Number of previous loans where the first repayment was late.
total_closed_late Number of previous loans closed later than the agreed term.
avg_prev_repayment_ratio Average repayment ratio for previous loans.
avg_duration_days Average duration (in days) of previous loans.
avg_prev_interest Average interest amount paid in previous loans.
age Customer’s age in years.
Code Text

Gemini

VISUALIZATION OF OUR DATASET

After verifying that our dataset is clean, we now move to visualizing each column in our dataset for analysis. To do this I will split the columns into numerical, categorical, and datetime groups.

Code Text

Gemini

Features Removed and Reasons

I am going to drop some features as I believe that they will not be useful for our prediction and may be redundant and create noise.

Below is a summary of the columns we removed from our dataset before training the predictive model, along with the reasons for their removal:

Column Name Reason for Removal
customerid This is a unique identifier for each customer. It does not have any relationship with loan repayment behavior and provides no predictive signal.
systemloanid This is a unique identifier for each loan transaction. Like customerid, it is purely administrative and carries no predictive value.
approveddate While dates can be useful when transformed into features like “month” or “day of week,” the raw approval date itself is too granular and will not generalize well in predictions. Instead, derived features are preferred.
creationdate Same reasoning as approveddate — raw date values have limited predictive value, but derived temporal patterns may be useful.
longitude_gps Raw GPS coordinates are too detailed to be directly useful
latitude_gps Raw GPS coordinates are too detailed to be directly useful
loannumber Sequential loan count per customer; dropped to prevent data leakage and redundancy with num_prev_loans
bank_name_clients It acts as an identifier and usually doesn't provide meaningful predictive information for loan default risk

Before I visualize, I will drop them.

Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3269 entries, 0 to 3268
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   loanamount                 3269 non-null   float64
 1   totaldue                   3269 non-null   float64
 2   termdays                   3269 non-null   int64  
 3   good_bad_flag              3269 non-null   object 
 4   approved_time              3269 non-null   object 
 5   creation_time              3269 non-null   object 
 6   bank_account_type          3269 non-null   object 
 7   employment_status_clients  3269 non-null   object 
 8   birth_year                 3269 non-null   int32  
 9   num_prev_loans             3269 non-null   float64
 10  avg_prev_loanamt           3269 non-null   float64
 11  avg_repay_delay_days       3269 non-null   float64
 12  total_firstrepaid_late     3269 non-null   float64
 13  total_closed_late          3269 non-null   float64
 14  avg_prev_repayment_ratio   3269 non-null   float64
 15  avg_duration_days          3269 non-null   float64
 16  avg_prev_interest          3269 non-null   float64
 17  age                        3269 non-null   int64  
 18  interest_curr_amount       3269 non-null   float64
 19  interest_curr_rate         3269 non-null   float64
 20  repayment_curr_ratio       3269 non-null   float64
 21  repayment_efficiency       3269 non-null   float64
 22  late_payment_rate          3269 non-null   float64
dtypes: float64(15), int32(1), int64(2), object(5)
memory usage: 574.8+ KB
Code Text

Gemini
Code Text

Gemini
['loanamount',
 'totaldue',
 'termdays',
 'birth_year',
 'num_prev_loans',
 'avg_prev_loanamt',
 'avg_repay_delay_days',
 'total_firstrepaid_late',
 'total_closed_late',
 'avg_prev_repayment_ratio',
 'avg_duration_days',
 'avg_prev_interest',
 'age',
 'interest_curr_amount',
 'interest_curr_rate',
 'repayment_curr_ratio',
 'repayment_efficiency',
 'late_payment_rate']
Code Text

Gemini
['approved_time',
 'creation_time',
 'bank_account_type',
 'employment_status_clients']
Code Text

Gemini

### THE NUMERICAL COLUMNS

Code Text

Gemini
Code Text

Gemini

OBSERVATION FROM THE VISUALIZATION OF THE NUMERICAL COLUMN

  • loanamount: is slightly skewed to the right with little outlier which is normal cause the loanamount can be higher in a real sense
  • totaldue: is skewed to the right with outliers
  • termdays: is actually very okay
  • interest_curr_amount: is a slightly skewed to the left with outliers
  • interest-curr_rate: is very okay with no outlier
  • repayment_curr-ratio: is very okay with no outlier
  • monthly_curr_due: is okay with little outliers
Code Text

Gemini

THE CATEGORICAL COLUMNS

Code Text

Gemini
# Visualizing the distribution of 2 categorical columns
# NOTE: the paste had lost the commas — subplots(12, figsize=(156)) would
# create a 12x1 grid with a broken figsize; the comment confirms 1 row, 2 cols.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))  # 1 row, 2 columns

for i, col in enumerate(cat_cols[:2]):  # pick only the first 2 categorical columns
    sns.countplot(ax=axes[i], data=df_final, x=col)
    axes[i].set_title(f'Distribution of {col}')
    axes[i].tick_params(axis='x', rotation=45)  # slant long category labels

plt.tight_layout()
plt.show()
Code Text

Gemini

THE DESCRIPTIVE STATISTICS AND CORRELATION MATRIX

Code Text

Gemini
# descriptive statistics (transposed so each feature is a row, rounded to 2 dp)
display(df_final.describe().T.round(2))

# correlation matrix of the numerical features
# fix: figsize=(128) was a single int from a lost comma — should be (12, 8)
plt.figure(figsize=(12, 8))
sns.heatmap(df_final[num_cols].corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix of Numerical Features')
plt.show()
Code Text

Gemini

THE TARGET COLUMN

Code Text

Gemini

Next, I move to visualizing the target column, i.e., good_bad_flag, which is the loan performance indicator (binary target: 0 = Bad (defaulted), 1 = Good (did not default)).

Code Text

Gemini
# Bar chart of the class counts in the target column
df_final['good_bad_flag'].value_counts().plot(kind='bar')
Code Text

Gemini

THE DESCRIPTIVE STATISTICS AND CORRELATION MATRIX

Code Text

Gemini
# Inspect the distinct labels present in the target column
df_final['good_bad_flag'].unique()
array(['Good', 'Bad'], dtype=object)
Code Text

Gemini
Code Text

Gemini
array([1, 0])
Code Text

Gemini
# Re-plot the target distribution (now label-encoded) as a bar chart
df_final['good_bad_flag'].value_counts().plot(kind='bar')
Code Text

Gemini
num_cols
['loanamount',
 'totaldue',
 'termdays',
 'birth_year',
 'num_prev_loans',
 'avg_prev_loanamt',
 'avg_repay_delay_days',
 'total_firstrepaid_late',
 'total_closed_late',
 'avg_prev_repayment_ratio',
 'avg_duration_days',
 'avg_prev_interest',
 'age',
 'interest_curr_amount',
 'interest_curr_rate',
 'repayment_curr_ratio',
 'repayment_efficiency',
 'late_payment_rate']
Code Text

Gemini
Code Text

Gemini
# checking correlation matrix (num_col includes the encoded target so we can
# read off each feature's correlation with good_bad_flag)
heatmap = df_final[num_col].corr()
# fix: figsize=(2010) was a single int from a lost comma — should be (20, 10)
plt.figure(figsize=(20, 10))
sns.heatmap(data=heatmap, fmt=".2f", annot=True, cmap="coolwarm")
plt.title("Correlation Matrix", fontweight="bold")
plt.show()
Code Text

Gemini

Correlation Analysis Summary - Loan Default Prediction

Key Findings

Strongest Predictors

  • late_payment_rate: -0.28 (best predictor)
  • avg_repay_delay_days: -0.23 (second best)

Moderate Predictors (0.10-0.20)

  • loanamount: 0.12
  • repayment_curr_ratio: 0.12
  • total_firstrepaid_late: -0.14

Weak Predictors (<0.10)

  • termdays: 0.02
  • age: 0.06
  • repayment_efficiency: -0.02

Critical Multicollinearity Issues

Remove these features immediately:

  • totaldue (r=0.99 with loanamount)
  • interest_curr_amount (r=0.88 with termdays)
  • avg_prev_loanamt (r=0.85 with num_prev_loans)
  • total_closed_late (r=0.88 with total_firstrepaid_late)

Implementation

Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
# Numerical columns kept after dropping the multicollinear features, plus the
# encoded target so the next correlation heatmap shows feature↔target links.
# fix: the list had lost its commas ('loanamount''good_bad_flag' would
# silently concatenate into one string).
num_col = ['loanamount',
           'good_bad_flag',
           'termdays',
           'repayment_curr_ratio',
           'num_prev_loans',
           'avg_repay_delay_days',
           'total_firstrepaid_late',
           'avg_prev_repayment_ratio',
           'avg_duration_days',
           'avg_prev_interest',
           'age',
           'repayment_efficiency',
           'late_payment_rate']
Code Text

Gemini
Code Text

Gemini

Code Text

Gemini

Code Text

Gemini

# Data Preparation

Code Text

Gemini
# viewing the columns remaining after the correlation-based feature pruning
df_final.columns
Index(['loanamount', 'termdays', 'good_bad_flag', 'approved_time',
       'creation_time', 'bank_account_type', 'employment_status_clients',
       'birth_year', 'num_prev_loans', 'avg_repay_delay_days',
       'total_firstrepaid_late', 'avg_prev_repayment_ratio',
       'avg_duration_days', 'avg_prev_interest', 'age', 'interest_curr_rate',
       'repayment_curr_ratio', 'repayment_efficiency', 'late_payment_rate'],
      dtype='object')
Code Text

Gemini
# separate the features and target
y = df_final['good_bad_flag']
X = df_final.drop(columns=['good_bad_flag'])
Code Text

Gemini
# Hold out 20% for testing, stratified so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
Code Text

Gemini

Code Text

Gemini

Data Processing

Code Text

Gemini
# Partition the feature columns by dtype: numeric vs object (categorical)
num_cols = list(X.select_dtypes(include=np.number).columns)
cat_cols = list(X.select_dtypes(include='object').columns)
Code Text

Gemini
['approved_time',
 'creation_time',
 'bank_account_type',
 'employment_status_clients']
Code Text

Gemini
['approved_time',
 'creation_time',
 'bank_account_type',
 'employment_status_clients']
Code Text

Gemini

### DATA PREPROCESSING USING A COLUMN TRANSFORMER

Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
Code Text

Gemini
metrics_df
Code Text

Gemini

Code Text

Gemini

Code Text

Gemini

Balancing Target

Code Text

Gemini
results = {}
# fix: subplots(23, figsize=(164)) had lost commas — 2x3 grid of axes
fig, axes = plt.subplots(2, 3, figsize=(16, 4))
for (name, model), ax in zip(models.items(), axes.flatten()):
    # Pipeline with SMOTE balancing (imblearn Pipeline so SMOTE only runs at fit time)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),  # handles encoding/scaling
        ('smote', SMOTE(random_state=42)),  # balances the classes
        ('var_thresh', VarianceThreshold(threshold=0.01)),  # removes low-variance features
        ('classifier', model)  # the model itself
    ])

    # Fit on raw training data (SMOTE applied internally to training data only)
    pipeline.fit(X_train, y_train)

    # Predictions
    train_pred = pipeline.predict(X_train)
    test_pred = pipeline.predict(X_test)

    # Determine if binary or multi-class
    n_classes = len(np.unique(y_train))
    is_binary = n_classes == 2

    # Compute metrics (simplified for binary classification)
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    precision = precision_score(y_test, test_pred)
    recall = recall_score(y_test, test_pred)
    f1 = f1_score(y_test, test_pred)

    # ROC AUC — only for classifiers that expose predict_proba
    if hasattr(pipeline.named_steps['classifier'], "predict_proba"):
        test_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # positive class probability
        roc_auc = roc_auc_score(y_test, test_pred_proba)
    else:
        roc_auc = None

    results[name] = {
        "Train Accuracy": train_acc,
        "Test Accuracy": test_acc,
        "Precision Score": precision,
        "Recall Score": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc
    }

    # Printing classification report for each model
    # fix: target_names list had lost its comma ('0 (Default)''1 (No Default)'
    # concatenates into ONE label and classification_report raises)
    print(f"\nDetailed Classification Report for {name}:")
    print("=" * 60)
    print(classification_report(y_test, test_pred, target_names=['0 (Default)', '1 (No Default)']))

    # Confusion matrix
    cm = confusion_matrix(y_test, test_pred)  # y_true first, y_pred second
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_train))
    disp.plot(ax=ax, cmap='Blues')
    ax.set_title(f'{name}\nF1: {f1:.3f}')

plt.tight_layout()
plt.show()

# Display results
metrics_df = pd.DataFrame(results).T  # Transpose for models as rows
print("\nModel Performance (After SMOTE Balancing):")
print("=" * 55)
print(metrics_df.round(3))

# Highlight best performing models
print("\nBest Models by Metric:")
print("=" * 30)
for metric in metrics_df.columns:
    if metric != 'ROC AUC' or metrics_df[metric].notna().any():
        best_model = metrics_df[metric].idxmax()
        best_score = metrics_df.loc[best_model, metric]
        # fix: restore the ': ' separator lost between the format fields
        print(f"{metric:<15}: {best_model:<20} ({best_score:.3f})")

# Show class distribution analysis
print(f"\nOriginal Class Distribution:")
print("=" * 35)
unique, counts = np.unique(y_train, return_counts=True)
for class_label, count in zip(unique, counts):
    percentage = (count / len(y_train)) * 100
    print(f"Class {class_label}: {count:>6} samples ({percentage:>5.1f}%)")

imbalance_ratio = max(counts) / min(counts)
print(f"\nOriginal Imbalance Ratio: {imbalance_ratio:.2f}:1")
print("✅ SMOTE applied - classes are now balanced in training data")
Code Text

Gemini
# Preprocess first
# NOTE(review): `preprocessor` is re-fitted here on X_train, and `pipeline`
# below is whatever was left bound by the last iteration of the model loop
# above — this cell only inspects class counts, it does not train anything.
X_train_processed = preprocessor.fit_transform(X_train)

# Apply SMOTE standalone to see the balanced class counts it produces
X_resampled, y_resampled = pipeline.named_steps['smote'].fit_resample(X_train_processed, y_train)

# Check balance — expect both classes at equal counts after resampling
print(y_resampled.value_counts())
good_bad_flag
1    2045
0    2045
Name: count, dtype: int64
Code Text

Gemini
# Preprocess the TEST set with the transformer already fitted on the training
# data. fix: the original used fit_transform here, which re-learns the
# scaler/encoder statistics from the test set — classic test-set leakage.
X_test_processed = preprocessor.transform(X_test)

# Apply SMOTE — for class-count inspection ONLY. Never train or evaluate a
# model on a resampled test set; it no longer reflects the real class ratio.
X_resampled, y_resampled = pipeline.named_steps['smote'].fit_resample(X_test_processed, y_test)

# Check balance
print(y_resampled.value_counts())
good_bad_flag
0    511
1    511
Name: count, dtype: int64
Code Text

Gemini
metrics_df
Code Text

Gemini
df_final.columns
Index(['loanamount', 'termdays', 'good_bad_flag', 'approved_time',
       'creation_time', 'bank_account_type', 'employment_status_clients',
       'birth_year', 'num_prev_loans', 'avg_repay_delay_days',
       'total_firstrepaid_late', 'avg_prev_repayment_ratio',
       'avg_duration_days', 'avg_prev_interest', 'age', 'interest_curr_rate',
       'repayment_curr_ratio', 'repayment_efficiency', 'late_payment_rate'],
      dtype='object')
Code Text

Gemini
# Square-root transforms of the right-skewed features, each stored as a new
# sqrt_<name> column so the raw columns stay available for comparison.
for src in ['late_payment_rate', 'termdays', 'loanamount',
            'avg_prev_interest', 'repayment_efficiency']:
    df_final[f'sqrt_{src}'] = np.sqrt(df_final[src])
Code Text

Gemini
# fix: the list and subplots call had lost their commas — five separate
# column names and a 3x2 grid sized (15, 8)
target_columns = ['sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount',
                  'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency']

fig, axes = plt.subplots(3, 2, figsize=(15, 8))  # 3x2 grid
axes = axes.flatten()  # flatten into 1D array for easy looping

for i, col in enumerate(target_columns):
    sns.histplot(data=df_final, x=col, bins=30, kde=True, edgecolor='black', alpha=0.7, ax=axes[i])
    axes[i].set_title(f"Distribution of {col}")

# Hide the unused last subplot (5 plots in a 6-cell grid)
fig.delaxes(axes[-1])

plt.tight_layout()
plt.show()
Code Text

Gemini
# fix: restored the commas lost in the list literal and the subplots call
target_columns = ['sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount',
                  'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency']

fig, axes = plt.subplots(3, 2, figsize=(15, 8))  # 3x2 grid
axes = axes.flatten()  # flatten into 1D array for easy looping

for i, col in enumerate(target_columns):
    sns.boxplot(data=df_final, x=col, ax=axes[i])
    axes[i].set_title(f"Distribution of {col}")

# Hide the unused last subplot (5 plots in a 6-cell grid)
fig.delaxes(axes[-1])

plt.tight_layout()
plt.show()
Code Text

Gemini
# Winsorize the extreme upper tail: cap repayment_efficiency at its own
# 99th percentile so a handful of outliers can't dominate scaling.
cap = df_final['repayment_efficiency'].quantile(0.99)
df_final['repayment_efficiency'] = df_final['repayment_efficiency'].clip(upper=cap)
Code Text

Gemini
# Boxplot after capping — confirms the extreme upper outliers are gone
sns.boxplot(df_final['repayment_efficiency'])
Code Text

Gemini
# Columns for the post-transform correlation check, with the encoded target
# appended so the heatmap shows each sqrt feature's link to good_bad_flag.
# fix: the adjacent string literals had lost their separating commas.
target_columns = ['sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount',
                  'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency', 'good_bad_flag']
Code Text

Gemini
# checking correlation matrix of the sqrt-transformed features vs the target
heatmap = df_final[target_columns].corr()
# fix: figsize=(2010) was a single int from a lost comma — should be (20, 10)
plt.figure(figsize=(20, 10))
sns.heatmap(data=heatmap, fmt=".2f", annot=True, cmap="coolwarm")
plt.title("Correlation Matrix", fontweight="bold")
plt.show()
Code Text

Gemini
df_final.columns
Index(['loanamount', 'termdays', 'good_bad_flag', 'approved_time',
       'creation_time', 'bank_account_type', 'employment_status_clients',
       'birth_year', 'num_prev_loans', 'avg_repay_delay_days',
       'total_firstrepaid_late', 'avg_prev_repayment_ratio',
       'avg_duration_days', 'avg_prev_interest', 'age', 'interest_curr_rate',
       'repayment_curr_ratio', 'repayment_efficiency', 'late_payment_rate',
       'sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount',
       'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency'],
      dtype='object')
Code Text

Gemini
# prediction: build the final feature matrix — drop the raw columns that now
# have sqrt_* replacements, plus the target itself.
# fix: the drop list had lost its commas, which would have concatenated all
# six names into one nonexistent column and raised a KeyError.
X = df_final.drop(columns=['late_payment_rate', 'termdays', 'good_bad_flag',
                           'loanamount', 'avg_prev_interest', 'repayment_efficiency'])
y = df_final["good_bad_flag"]
Code Text

Gemini
# 80/20 stratified split on the final feature set, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
Code Text

Gemini
# Re-derive the numeric and categorical column lists for the final features
num_cols = list(X.select_dtypes(include=np.number).columns)
cat_cols = list(X.select_dtypes(include='object').columns)
Code Text

Gemini
num_cols
['birth_year',
 'num_prev_loans',
 'avg_repay_delay_days',
 'total_firstrepaid_late',
 'avg_prev_repayment_ratio',
 'avg_duration_days',
 'age',
 'interest_curr_rate',
 'repayment_curr_ratio',
 'sqrt_late_payment_rate',
 'sqrt_termdays',
 'sqrt_loanamount',
 'sqrt_avg_prev_interest',
 'sqrt_repayment_efficiency']
Code Text

Gemini
cat_cols
['approved_time',
 'creation_time',
 'bank_account_type',
 'employment_status_clients']
Code Text

Gemini
# COLUMN TRANSFORMER PIPELINE
# Numeric features: standardize to zero mean / unit variance
num_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical features: dense one-hot encoding, tolerant of categories
# unseen at fit time (they encode as all-zeros instead of raising)
cat_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group through its transformer
preprocessor = ColumnTransformer(transformers=[
    ('scaled_num', num_transformer, num_cols),
    ('encoded_cat', cat_transformer, cat_cols),
])


# Candidate classifiers, all seeded for reproducibility
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Gradient Boost': GradientBoostingClassifier(random_state=42),
    'Cat Boost': CatBoostClassifier(random_seed=42, verbose=0),
}
Code Text

Gemini
Code Text

Gemini
results = {}
# fix: subplots(23, figsize=(164)) had lost commas — 2x3 grid of axes
fig, axes = plt.subplots(2, 3, figsize=(16, 4))
for (name, model), ax in zip(models.items(), axes.flatten()):
    # Pipeline with SMOTE balancing (imblearn Pipeline so SMOTE only runs at fit time)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),  # handles encoding/scaling
        ('smote', SMOTE(random_state=42)),  # balances the classes
        ('var_thresh', VarianceThreshold(threshold=0.01)),  # removes low-variance features
        ('classifier', model)  # the model itself
    ])

    # Fit on raw training data (SMOTE applied internally to training data only)
    pipeline.fit(X_train, y_train)

    # Predictions
    train_pred = pipeline.predict(X_train)
    test_pred = pipeline.predict(X_test)

    # Determine if binary or multi-class
    n_classes = len(np.unique(y_train))
    is_binary = n_classes == 2

    # Compute metrics (simplified for binary classification)
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    precision = precision_score(y_test, test_pred)
    recall = recall_score(y_test, test_pred)
    f1 = f1_score(y_test, test_pred)

    # ROC AUC — only for classifiers that expose predict_proba
    if hasattr(pipeline.named_steps['classifier'], "predict_proba"):
        test_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # positive class probability
        roc_auc = roc_auc_score(y_test, test_pred_proba)
    else:
        roc_auc = None

    results[name] = {
        "Train Accuracy": train_acc,
        "Test Accuracy": test_acc,
        "Precision Score": precision,
        "Recall Score": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc
    }

    # Printing classification report for each model
    # fix: target_names list had lost its comma ('0 (Default)''1 (No Default)'
    # concatenates into ONE label and classification_report raises)
    print(f"\nDetailed Classification Report for {name}:")
    print("=" * 60)
    print(classification_report(y_test, test_pred, target_names=['0 (Default)', '1 (No Default)']))

    # Confusion matrix
    cm = confusion_matrix(y_test, test_pred)  # y_true first, y_pred second
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_train))
    disp.plot(ax=ax, cmap='Blues')
    ax.set_title(f'{name}\nF1: {f1:.3f}')

plt.tight_layout()
plt.show()

# Display results
metrics_df = pd.DataFrame(results).T  # Transpose for models as rows
print("\nModel Performance (After SMOTE Balancing):")
print("=" * 55)
print(metrics_df.round(3))

# Highlight best performing models
print("\nBest Models by Metric:")
print("=" * 30)
for metric in metrics_df.columns:
    if metric != 'ROC AUC' or metrics_df[metric].notna().any():
        best_model = metrics_df[metric].idxmax()
        best_score = metrics_df.loc[best_model, metric]
        # fix: restore the ': ' separator lost between the format fields
        print(f"{metric:<15}: {best_model:<20} ({best_score:.3f})")

# Show class distribution analysis
print(f"\nOriginal Class Distribution:")
print("=" * 35)
unique, counts = np.unique(y_train, return_counts=True)
for class_label, count in zip(unique, counts):
    percentage = (count / len(y_train)) * 100
    print(f"Class {class_label}: {count:>6} samples ({percentage:>5.1f}%)")

imbalance_ratio = max(counts) / min(counts)
print(f"\nOriginal Imbalance Ratio: {imbalance_ratio:.2f}:1")
print("✅ SMOTE applied - classes are now balanced in training data")
Code Text

Gemini
from sklearn.metrics import roc_curve, auc
# ROC Curve Plot — overlay one curve per model with probability outputs
# fix: the paste lost commas throughout (figsize, colors list, lw=2 arg,
# reference-line coordinates, axis limits); all restored below.
plt.figure(figsize=(10, 8))

colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown']

for (name, model), color in zip(models.items(), colors):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('var_thresh', VarianceThreshold(threshold=0.01)),
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)

    # Only models exposing predict_proba can be plotted on a ROC curve
    if hasattr(pipeline.named_steps['classifier'], "predict_proba"):
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)

        plt.plot(fpr, tpr, color=color, lw=2,
                label=f'{name} (AUC = {roc_auc:.3f})')

# Reference line: a random classifier's diagonal
plt.plot([0, 1], [0, 1], 'k--', alpha=0.7, label='Random (AUC = 0.500)')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Model Comparison')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Code Text

Gemini

Code Text

Gemini

Code Text

Gemini
# Initialize results dictionary
results = {}

# Model name
name = "GradientBoostClassifier"

# Final model — the classifier chosen from the comparison round
final_model = GradientBoostingClassifier(random_state=42)

# Pipeline: preprocess -> SMOTE (train-time only) -> variance filter -> model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('var_thresh', VarianceThreshold(threshold=0.01)),
    ('classifier', final_model)
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predictions
train_pred = pipeline.predict(X_train)
test_pred = pipeline.predict(X_test)

# Metrics
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)
precision = precision_score(y_test, test_pred)
recall = recall_score(y_test, test_pred)
f1 = f1_score(y_test, test_pred)

# ROC AUC only when the classifier provides probability estimates
if hasattr(pipeline.named_steps['classifier'], "predict_proba"):
    test_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, test_pred_proba)
else:
    roc_auc = None

# Save results
results[name] = {
    "Train Accuracy": train_acc,
    "Test Accuracy": test_acc,
    "Precision Score": precision,
    "Recall Score": recall,
    "F1 Score": f1,
    "ROC AUC": roc_auc
}

# === PRINT RESULTS ===
print(f"\n📊 Results for {name}:")
for metric, value in results[name].items():
    if value is not None:
        # fix: restore the ': ' separator lost between the format fields
        print(f"{metric}: {value:.4f}")
    else:
        print(f"{metric}: None")

# === TABLE VIEW OF RESULTS ===
df_results = pd.DataFrame(results).T  # Models as rows
print("\n📋 All Results Table:")
print(df_results)

# === CLASSIFICATION REPORT ===
print("\n📑 Classification Report:")
print(classification_report(
    y_test, test_pred,
    target_names=[str(lbl) for lbl in np.unique(y_train)]
))

# === CONFUSION MATRIX PLOT ===
fig, ax = plt.subplots()
cm = confusion_matrix(y_test, test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_train))
disp.plot(ax=ax, cmap='Blues')
ax.set_title(f'{name}\nF1: {f1:.3f}')
plt.tight_layout()
plt.show()

Code Text

Gemini
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np

models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Gradient Boost': GradientBoostingClassifier(random_state=42),
    'Cat Boost': CatBoostClassifier(random_state=42, verbose=False)
}

# 5-fold stratified cross validation (preserves the class ratio per fold)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Stratified CV Results:")
print("=" * 70)

# Store results for ranking
results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('var_thresh', VarianceThreshold(threshold=0.01)),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    # Multiple metrics for comprehensive evaluation
    roc_scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='roc_auc', n_jobs=-1)
    pr_scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='average_precision', n_jobs=-1)
    f1_scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='f1', n_jobs=-1)

    # Calculate means and stability
    roc_mean, roc_std = roc_scores.mean(), roc_scores.std()
    pr_mean, pr_std = pr_scores.mean(), pr_scores.std()
    f1_mean, f1_std = f1_scores.mean(), f1_scores.std()

    stability = "Stable" if roc_std < 0.05 else "Unstable"

    # Store all metrics
    results[name] = {
        'roc_mean': roc_mean, 'roc_std': roc_std,
        'pr_mean': pr_mean, 'pr_std': pr_std,
        'f1_mean': f1_mean, 'f1_std': f1_std
    }

    # fix: the format string lost its '(±…) ' pieces in the paste — restored
    # to match the printed output, e.g. "ROC=0.698(±0.025) | ..."
    print(f"{name:<20}: ROC={roc_mean:.3f}(±{roc_std:.3f}) | PR-AUC={pr_mean:.3f}(±{pr_std:.3f}) | F1={f1_mean:.3f}(±{f1_std:.3f}) [{stability}]")

# Model selection recommendations
print("\n" + "=" * 70)
print("MODEL SELECTION RECOMMENDATIONS:")
print("-" * 40)

# Rank by PR-AUC (best for imbalanced data) with ROC as tie-breaker
sorted_by_pr = sorted(results.items(), key=lambda x: (x[1]['pr_mean'], x[1]['roc_mean']), reverse=True)
sorted_by_roc = sorted(results.items(), key=lambda x: x[1]['roc_mean'], reverse=True)

print("Rankings:")
print(f"🎯 Best by PR-AUC: {sorted_by_pr[0][0]} (PR={sorted_by_pr[0][1]['pr_mean']:.3f})")
print(f"🏆 Best by ROC-AUC: {sorted_by_roc[0][0]} (ROC={sorted_by_roc[0][1]['roc_mean']:.3f})")

# Overall recommendation (PR-AUC primary for imbalanced data)
best_model = sorted_by_pr[0][0]
best_pr_score = sorted_by_pr[0][1]['pr_mean']
print(f"✅ RECOMMENDED: {best_model} (Focus on PR-AUC for imbalanced data)")

# Top candidates for hyperparameter tuning: within 0.02 PR-AUC of the best
tuning_candidates = [name for name, scores in sorted_by_pr
                    if scores['pr_mean'] >= best_pr_score - 0.02]

print(f"🔧 Tune these models: {', '.join(tuning_candidates)}")
print(f"📈 Expected PR-AUC after tuning: {best_pr_score + 0.02:.3f} - {best_pr_score + 0.05:.3f}")

# Save for next steps
best_models = dict(sorted_by_pr[:3])
print(f"💾 Top 3 saved for further analysis: {list(best_models.keys())}")

print("\n💡 TIP: PR-AUC is better than ROC-AUC for imbalanced datasets!")
Stratified CV Results:
======================================================================
Logistic Regression : ROC=0.698(±0.025) | PR-AUC=0.877(±0.014) | F1=0.789(±0.008) [Stable]
Decision Tree       : ROC=0.572(±0.028) | PR-AUC=0.808(±0.010) | F1=0.796(±0.016) [Stable]
Random Forest       : ROC=0.653(±0.038) | PR-AUC=0.856(±0.019) | F1=0.832(±0.010) [Stable]
XGBoost             : ROC=0.655(±0.042) | PR-AUC=0.861(±0.023) | F1=0.847(±0.007) [Stable]
Gradient Boost      : ROC=0.678(±0.039) | PR-AUC=0.868(±0.020) | F1=0.846(±0.010) [Stable]
Cat Boost           : ROC=0.669(±0.049) | PR-AUC=0.865(±0.025) | F1=0.855(±0.013) [Stable]

======================================================================
MODEL SELECTION RECOMMENDATIONS:
----------------------------------------
Rankings:
🎯 Best by PR-AUC: Logistic Regression (PR=0.877)
🏆 Best by ROC-AUC: Logistic Regression (ROC=0.698)
✅ RECOMMENDED: Logistic Regression (Focus on PR-AUC for imbalanced data)
🔧 Tune these models: Logistic Regression, Gradient Boost, Cat Boost, XGBoost
📈 Expected PR-AUC after tuning: 0.897 - 0.927
💾 Top 3 saved for further analysis: ['Logistic Regression', 'Gradient Boost', 'Cat Boost']

💡 TIP: PR-AUC is better than ROC-AUC for imbalanced datasets!
Code Text

Gemini
Code Text

Gemini
# ===== Train and evaluate the final Gradient Boosting pipeline =====
# Fits preprocessing + SMOTE + variance filter + classifier on the training
# split, collects train/test metrics in `results`, prints them, and plots the
# confusion matrix for the held-out test set.

# Initialize results dictionary
results = {}

# Model name
name = "GradientBoosting"

# Final model
final_model = GradientBoostingClassifier(random_state=42)

# Pipeline (imblearn's Pipeline, so SMOTE is applied only during fitting,
# never at prediction time)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('var_thresh', VarianceThreshold(threshold=0.01)),
    ('classifier', final_model)
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predictions
train_pred = pipeline.predict(X_train)
test_pred = pipeline.predict(X_test)

# Metrics (precision/recall/F1 are for the positive class on the test set)
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)
precision = precision_score(y_test, test_pred)
recall = recall_score(y_test, test_pred)
f1 = f1_score(y_test, test_pred)

# ROC-AUC needs class probabilities; skip it for models without predict_proba
if hasattr(pipeline.named_steps['classifier'], "predict_proba"):
    test_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, test_pred_proba)
else:
    roc_auc = None

# Save results
results[name] = {
    "Train Accuracy": train_acc,
    "Test Accuracy": test_acc,
    "Precision Score": precision,
    "Recall Score": recall,
    "F1 Score": f1,
    "ROC AUC": roc_auc
}

# === PRINT RESULTS ===
print(f"\n📊 Results for {name}:")
for metric, value in results[name].items():
    if value is not None:
        # Fix: the metric name and value had no separator; print
        # "<metric>: <value>" to match the None branch below.
        print(f"{metric}: {value:.4f}")
    else:
        print(f"{metric}: None")

# === TABLE VIEW OF RESULTS ===
df_results = pd.DataFrame(results).T  # Models as rows
print("\n📋 All Results Table:")
print(df_results)

# === CLASSIFICATION REPORT ===
print("\n📑 Classification Report:")
print(classification_report(
    y_test, test_pred,
    target_names=[str(lbl) for lbl in np.unique(y_train)]
))

# === CONFUSION MATRIX PLOT ===
fig, ax = plt.subplots()
cm = confusion_matrix(y_test, test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_train))
disp.plot(ax=ax, cmap='Blues')
ax.set_title(f'{name}\nF1: {f1:.3f}')
plt.tight_layout()
plt.show()

Code Text

Gemini

Code Text

Gemini

Code Text

Gemini
# ===== 1. Fit the pipeline =====
# Refit the full pipeline (preprocessor -> SMOTE -> variance filter ->
# classifier) on the training split before inspecting the trained classifier.
pipeline.fit(X_train, y_train)

# ===== 2. Extract the trained classifier from the pipeline =====
# NOTE(review): the original comment said "CatBoost", but `pipeline` was last
# built with a GradientBoostingClassifier step — confirm which model is meant.
model = pipeline.named_steps['classifier']

# ===== 3. Transform X_train with preprocessing only =====
# Only the ColumnTransformer output is produced here; the SMOTE and
# variance-threshold steps are intentionally skipped for SHAP input.
X_train_processed = pipeline.named_steps['preprocessor'].transform(X_train)

# ===== 4. Get proper feature names from the preprocessor =====
# This works for most ColumnTransformer setups with OneHotEncoder
def get_feature_names(preprocessor):
    """Return the output feature names of a fitted ColumnTransformer.

    Walks ``preprocessor.transformers_`` and, for each entry:
      * skips ``'drop'`` transformers,
      * for nested Pipelines, uses the *last* step — the last step is what
        determines the features the pipeline emits (the original code took
        the first step, which is wrong for multi-step pipelines),
      * uses ``get_feature_names_out`` when available (e.g. OneHotEncoder),
      * otherwise falls back to the input column names (e.g. scalers or a
        ``'passthrough'`` entry, which has neither attribute).

    Parameters
    ----------
    preprocessor : fitted ColumnTransformer (anything with ``transformers_``)

    Returns
    -------
    list of str : output feature names, in transformer order.
    """
    feature_names = []
    for name, transformer, columns in preprocessor.transformers_:
        if transformer == 'drop':
            continue
        if hasattr(transformer, 'named_steps'):  # Pipeline inside ColumnTransformer
            # The last step defines the pipeline's output feature space.
            transformer = transformer.steps[-1][1]
        if hasattr(transformer, 'get_feature_names_out'):
            names = list(transformer.get_feature_names_out(columns))
        else:
            names = list(columns)
        feature_names.extend(names)
    return feature_names

# Recover human-readable feature names for the preprocessed matrix.
feature_names = get_feature_names(pipeline.named_steps['preprocessor'])

# Convert to DataFrame so SHAP sees column names
# NOTE(review): if the 'var_thresh' step dropped any columns during training,
# the classifier's input space is narrower than this matrix — confirm the
# variance threshold removed nothing before trusting the SHAP output.
X_train_processed_df = pd.DataFrame(X_train_processed, columns=feature_names)

# ===== 5. SHAP Analysis =====
# TreeExplainer works on tree ensembles (GradientBoosting/CatBoost/etc.).
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train_processed_df)

# Summary plots with real feature names (bar = mean |SHAP|, then beeswarm)
shap.summary_plot(shap_values, X_train_processed_df, plot_type='bar')
shap.summary_plot(shap_values, X_train_processed_df)

# Force plot for one sample
# NOTE(review): the 2-D indexing assumes shap_values is a single array; for
# some binary classifiers shap returns a list per class — verify at runtime.
sample_index = 0
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_index, :],
    X_train_processed_df.iloc[sample_index, :]
)
Code Text

Gemini

Code Text

Gemini

Code Text

Gemini
# ===== Hyperparameter tuning for the Gradient Boosting pipeline =====
# Randomized search over a small grid, scored by PR-AUC with the same
# stratified 5-fold CV used for model comparison.
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.pipeline import Pipeline  # imblearn's Pipeline: SMOTE runs on training folds only
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
import numpy as np

# Same stratified CV setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("🔧 HYPERPARAMETER TUNING - GRADIENT BOOST")
print("=" * 50)

# GRADIENT BOOST TUNING
print("Tuning Gradient Boost...")
print("-" * 30)

gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('var_thresh', VarianceThreshold(threshold=0.01)),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

# Search space (fix: the pasted lists had lost their comma separators —
# the reported best params 0.01 / 200 / 3 / 0.8 confirm these values)
gb_params = {
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.8, 1.0]
}

# Use RandomizedSearchCV for faster search
gb_random = RandomizedSearchCV(
    gb_pipeline, gb_params,
    n_iter=20,  # Try 20 combinations
    cv=skf, scoring='average_precision',  # PR-AUC
    n_jobs=-1, verbose=1, random_state=42
)

gb_random.fit(X_train, y_train)

# Results
original_score = 0.867  # Your original GB PR-AUC
best_score = gb_random.best_score_
improvement = best_score - original_score

print(f"\n🏆 GRADIENT BOOST TUNING RESULTS:")
print("=" * 40)
print(f"✅ Best PR-AUC: {best_score:.3f} (Improvement: +{improvement:.3f})")
print(f"🎛️  Best parameters:")
for param, value in gb_random.best_params_.items():
    # Fix: restore the ": " separator lost in the paste
    print(f"   {param}: {value}")

# Save best model
best_gb_model = gb_random.best_estimator_

print(f"\n🎯 NEXT STEP: Test tuned Gradient Boost on holdout test set!")
print("💾 Best model saved as 'best_gb_model' variable")

print(f"\n🔍 Cross-validation confidence: {best_score:.3f}")
print(f"📈 Expected test performance: {best_score - 0.02:.3f} - {best_score:.3f}")
🔧 HYPERPARAMETER TUNING - GRADIENT BOOST
==================================================
Tuning Gradient Boost...
------------------------------
Fitting 5 folds for each of 20 candidates, totalling 100 fits

🏆 GRADIENT BOOST TUNING RESULTS:
========================================
✅ Best PR-AUC: 0.878 (Improvement: +0.011)
🎛️  Best parameters:
   classifier__subsample: 0.8
   classifier__n_estimators: 200
   classifier__max_depth: 3
   classifier__learning_rate: 0.01

🎯 NEXT STEP: Test tuned Gradient Boost on holdout test set!
💾 Best model saved as 'best_gb_model' variable

🔍 Cross-validation confidence: 0.878
📈 Expected test performance: 0.858 - 0.878
Code Text

Gemini
# ===== 1. FINAL MODEL EVALUATION ON TEST SET =====
# Scores the tuned pipeline on the untouched test split and assembles the
# metadata dictionary that is serialized alongside the model artifact.
from sklearn.metrics import (
    roc_auc_score, 
    average_precision_score, 
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
print("🔍 FINAL MODEL EVALUATION")
print("=" * 40)

# Your tuned model is stored in: best_gb_model
final_model = best_gb_model

# Test set predictions
y_test_pred = final_model.predict(X_test)
# Fix: take COLUMN 1 (positive-class probability) with [:, 1] — the pasted
# "[:1]" sliced the first row instead, which breaks the AUC computations.
y_test_proba = final_model.predict_proba(X_test)[:, 1]

# Calculate final metrics
test_roc_auc = roc_auc_score(y_test, y_test_proba)
test_pr_auc = average_precision_score(y_test, y_test_proba)

print(f"📊 FINAL TEST SET PERFORMANCE:")
print(f"   ROC-AUC: {test_roc_auc:.3f}")
print(f"   PR-AUC:  {test_pr_auc:.3f}")
print(f"   Expected: {best_score - 0.02:.3f} - {best_score:.3f}")

# Classification report
print(f"\n📋 CLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred))

# ===== 2. SAVE MODEL FOR DEPLOYMENT =====
print(f"\n💾 SAVING MODEL FOR DEPLOYMENT")
print("=" * 40)

# Save model metadata (fix: restore the ": " lost between key and value)
model_info = {
    'model_type': 'GradientBoostingClassifier',
    'cv_score': best_score,
    'test_roc_auc': test_roc_auc,
    'test_pr_auc': test_pr_auc,
    'best_params': gb_random.best_params_,
    'features_shape': X_train.shape[1],
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}
🔍 FINAL MODEL EVALUATION
========================================
📊 FINAL TEST SET PERFORMANCE:
   ROC-AUC: 0.709
   PR-AUC:  0.886
   Expected: 0.858 - 0.878

📋 CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.42      0.50      0.46       143
           1       0.85      0.81      0.83       511

    accuracy                           0.74       654
   macro avg       0.64      0.65      0.64       654
weighted avg       0.76      0.74      0.75       654


💾 SAVING MODEL FOR DEPLOYMENT
========================================
Code Text

Gemini
# Refit the tuned pipeline on the whole dataset (train + test) before
# deployment, so the shipped model learns from all available labelled data.
# NOTE(review): assumes X and y (full feature matrix / target) were defined
# in an earlier cell — confirm.
final_model.fit(X, y)
Code Text

Gemini
# Save the complete trained pipeline (preprocessing + classifier) so it can
# be reloaded for inference without refitting.
joblib.dump(final_model, 'gradients_boost_models.pkl')
['gradients_boost_models.pkl']
Code Text

Gemini
# Load our saved Model back from disk to verify the artifact round-trips.
model = joblib.load("gradients_boost_models.pkl")
Code Text

Gemini
model
Code Text

Gemini

Code Text

Gemini
Code Text

Gemini
df_final.avg_duration_days.unique()
array([ 29.45454545,   0.        ,  18.16666667,  31.5       ,
        27.        ,  19.5       ,  22.11111111,  17.66666667,
        12.        ,  13.        ,  23.        ,  26.6       ,
        21.5       ,  28.        ,  25.        ,  20.78571429,
        16.22222222,  22.5       ,   9.        ,  12.83333333,
         5.        ,  11.5       ,  12.66666667,  21.75      ,
         6.        ,  29.75      ,  24.33333333,  29.        ,
        27.5       ,  24.        ,  14.        ,  24.78571429,
        16.66666667,  22.57142857,  24.125     ,  21.2       ,
         8.        ,  17.85714286,  26.5625    ,  15.66666667,
        24.18181818,  23.54545455,  15.09090909,  19.83333333,
        21.        ,  43.28571429,  17.        ,  11.        ,
        14.6       ,  28.1       ,  36.09090909,  26.4       ,
        19.85714286,  20.5       ,  19.75      ,  26.        ,
        26.5       ,  26.375     ,  16.33333333,  30.        ,
        22.        ,  14.5       ,  33.625     ,  22.33333333,
        54.        ,  20.25      ,  25.66666667,  19.8       ,
        32.        ,  31.        ,  19.66666667,  37.        ,
        20.        ,  26.76470588,  19.        ,  17.18181818,
        14.66666667,  16.5       ,  16.        ,  15.        ,
         3.        ,  26.33333333,  32.5       ,  25.8       ,
        18.        ,   4.        ,  27.85714286,  28.5       ,
        31.18181818,  19.33333333,  34.22222222,  21.8       ,
        17.125     ,  34.        ,  33.        ,  27.33333333,
        29.33333333,  39.58333333,  27.83333333,  29.7       ,
        21.6       ,  22.66666667,  35.55555556,   7.        ,
        14.33333333,  35.88888889,  27.75      ,  24.625     ,
        14.25      ,  17.8       ,  21.33333333,  20.75      ,
        20.125     ,  23.5       ,  24.1       ,  28.2       ,
        28.66666667,  20.71428571,  34.5       ,  26.66666667,
        44.        ,  36.33333333,  15.25      ,  30.35714286,
        31.54545455,  38.        ,  34.66666667,  10.33333333,
        16.375     ,  33.66666667,  30.55555556,  36.88888889,
        29.25      ,  23.77777778,  36.        ,  16.6       ,
        24.69230769,  28.33333333,  17.4       ,  26.25      ,
        24.5       ,  18.25      ,  35.11111111,  38.66666667,
        20.44444444,  34.625     ,  12.75      ,   8.66666667,
        10.66666667,  13.6       ,  17.33333333,   9.66666667,
        23.33333333,   5.81818182,  27.30769231,  11.77777778,
        35.8       ,  33.375     ,  28.42857143,  26.75      ,
        10.        ,  36.57142857,  18.90909091,  28.6       ,
        25.5       ,  42.        ,  27.88888889,  46.25      ,
        30.25      ,  30.11111111,  53.        ,  28.71428571,
        10.5       ,   8.16666667,  16.4       ,  22.22222222,
        43.        ,  26.1875    ,  30.33333333,  37.44444444,
        25.55555556,  19.25      ,  28.16666667,  41.        ,
        37.125     ,  20.81818182,  17.42857143,  42.14285714,
        27.375     ,  19.54545455,  19.14285714,  30.75      ,
        27.44444444,  20.4       ,  10.55555556,  11.33333333,
        33.22222222,  18.6       ,  23.66666667,  32.66666667,
        24.75      ,  32.44444444,  20.66666667,  17.5       ,
        22.4       ,  19.81818182,  23.71428571,  25.33333333,
        29.16666667,  18.83333333,  27.7       ,  13.25      ,
        27.77777778,  31.83333333,  18.2       ,  16.875     ,
        27.21428571,  16.25      ,  29.5       ,  25.2       ,
        17.61538462,  30.57142857,  12.5       ,  16.14285714,
        15.57142857,  28.14285714,   9.5       ,  46.5       ,
        35.14285714,  22.25      ,  24.66666667,  18.5       ,
        39.        ,  27.23076923,  26.83333333,  38.33333333,
        28.72727273,  10.4       ,  21.25      ,  29.66666667,
        25.64285714,  13.33333333,  26.63636364,  33.85714286,
        17.25      ,  17.81818182,  25.25      ,  27.8       ,
         9.14285714,  27.54545455,  14.75      ,  29.07692308,
        34.33333333,  30.3       ,  29.625     ,  20.6       ,
        26.8       ,  32.85714286,  27.16666667,  30.72727273,
        11.14285714,  56.        ,  37.25      ,  30.83333333,
        32.11111111,  18.66666667,   9.16666667,  20.625     ,
        20.57142857,  17.38461538,  26.88888889,  27.66666667,
        38.5       ,  26.7       ,  13.2       ,  20.54545455,
        24.16666667,  13.375     ,  28.4       ,  23.81818182,
        17.71428571,  35.5       ,  30.42857143,  13.66666667,
        36.11111111,  26.14285714,  21.83333333,  18.22222222,
        61.6       ,  22.75      ,  31.88888889,  29.6       ,
        32.25      ,  25.44444444,  55.66666667,  29.28571429,
        11.75      ,  31.375     ,  17.45454545,   8.33333333,
        26.875     ,  25.81818182,  14.16666667,  30.22222222,
        13.85714286,  17.90909091,  21.66666667,  18.45454545,
        26.45454545,  46.        ,  30.66666667,  25.9       ,
        31.85714286,  19.91666667,  15.28571429,   2.        ,
        16.83333333,  23.69230769,  47.        ,  36.5       ,
        17.16666667,  50.        ,  31.28571429,  15.42857143,
        10.83333333,   9.22222222,  25.72727273,  40.8       ,
        27.90909091,  35.        ,  15.46153846,  24.08333333,
        29.85714286,  11.66666667,  30.5       ,  56.4       ,
         7.25      ,  24.25      ,  19.53333333,  38.71428571,
        23.08333333,  26.16666667,  18.57142857,  20.45454545,
        27.25      ,  13.71428571,  28.75      ,  26.81818182,
        31.08333333,  16.07142857,  15.2       ,  33.5       ,
        24.58333333,  19.625     ,  10.42857143,  29.14285714,
        23.57142857,  13.5       ,   6.5       ,  12.33333333,
        22.14285714,  37.14285714,  21.77777778,  27.07142857,
         7.5       ,  30.28571429,  27.4       ,  25.125     ,
        10.8       ,  15.1       ,  22.6       ,  49.        ,
        45.        ,  21.90909091, 146.        ,  26.2       ,
        29.91666667,  31.25      ,  21.86666667,  25.88888889,
        28.83333333,  18.4       ,  20.88888889,  18.88888889,
        34.125     ,  19.27777778,  23.25      ,  25.77777778,
        29.22222222,  24.375     ,  20.16666667,  20.33333333,
        32.125     ,  20.55555556,  34.8       ,  26.42857143,
        15.33333333,  15.7       ,  40.        ,  45.6       ,
        21.63636364,  25.57142857,  28.22222222,  26.09090909,
        12.42857143,  28.77777778,  12.4       ,  39.33333333,
        14.2       ,  15.77777778,  32.9       ,  18.8       ,
        17.77777778,  54.5       ,  29.88888889,  12.69230769,
        23.75      ,  19.71428571,  30.09090909,  44.5       ,
        17.26666667,  37.33333333,  17.6       ,  30.2       ,
        35.33333333,  28.8       ,  27.2       ,  20.14285714,
         9.25      ,  24.57142857,  26.08333333,  29.4       ,
        21.44444444,  21.9       ,  19.21428571,  18.75      ,
        25.75      ,  13.93333333,  10.6       ,  39.5       ,
        42.8       ,  31.42857143,  23.28571429,  17.375     ,
        37.375     ,   6.57692308,  34.14285714,  28.63636364,
        16.55555556,  22.42857143,  42.28571429,  23.9375    ,
        19.73333333,  37.5       ,  13.28571429,  27.14285714,
        22.71428571,  19.27272727,  24.88888889,  26.91666667,
        28.25      ,  23.61538462,  41.6       ,  10.93333333,
        15.5       ,  22.08333333,  19.55555556,  23.4       ,
        43.33333333,  57.        ,  25.42857143,  21.54545455,
        24.2       ,  22.8       ,  15.4       ,  15.75      ,
        21.28571429,  36.75      ,  34.42857143,  54.16666667,
        18.125     ,  41.5       ,  32.75      ,   6.66666667,
        21.3       ,  22.83333333,   8.14285714,  19.2       ,
        14.42857143,  36.25      ,  19.64705882,  32.4       ,
        28.78571429,  24.4       ,  35.81818182,  21.36363636,
        31.22222222,  11.85714286,  17.4375    ,  20.42857143,
        27.63636364,  36.72727273,  27.28571429,  16.75      ,
        13.16666667,  29.125     ,  52.        ,  32.7       ,
        32.33333333,  25.14285714,  12.6       ,  22.28571429,
        14.57142857,  18.05263158,  16.71428571,  22.2       ,
        25.71428571,  26.71428571,  32.72727273,  63.        ,
        20.7       ,  29.84615385,  17.88888889,  31.14285714,
         8.5       ,  18.33333333,  33.33333333,  28.55555556,
        20.83333333,   7.2       ,  27.45454545,  31.72727273,
        25.27777778,  25.78571429,  11.63636364,  33.88888889,
        25.83333333,  25.09090909,  11.25      ,  23.875     ,
        25.18181818,  29.42857143,  30.85714286,  29.27272727,
        29.9       ,  14.55555556,  21.71428571,  20.76190476,
        27.5625    ,  30.625     ,  19.6       ,   4.66666667,
        14.14285714,  13.83333333,  37.4       ,  19.44444444,
        58.        ,  20.07142857,  21.85714286,  16.90909091,
        20.11111111,  18.06666667,  21.125     ,  41.85714286,
        21.13333333,  12.25      ,  22.3       ,  29.09090909,
        17.44444444,  24.23076923,  33.42857143,  22.84615385,
        24.55555556,  20.85714286,  48.75      ,  13.625     ,
        19.41666667,  17.2       ,  53.2       ,  13.42857143,
        11.6       ,  31.625     ,  18.9047619 ,  19.16666667,
        16.2       ,  22.09090909,  24.6       ,  23.3       ,
        22.76923077,  20.2       ,  29.71428571,  25.4       ,
        21.88888889,  19.125     ,  12.375     ,  24.45454545,
         7.36363636,  20.92857143,  29.875     ,  31.6       ,
        37.6       ,  21.4       ,  18.27272727,  33.11111111,
        21.1       ,  11.4       ,  22.30769231,  31.92307692,
        24.91666667,  27.57142857,  32.6       ,  22.63636364,
        11.45454545,  22.55555556,  17.75      ,  22.44444444,
        10.22727273,  24.07692308,  27.9       ,  32.16666667,
        31.2       ,  31.66666667,  19.42857143,  24.81818182,
        28.125     ,  24.77777778,  25.875     ,  25.91666667,
        15.16666667,  18.875     ,  29.375     ,  26.41666667,
        32.83333333,  19.18181818,  29.36363636,  23.83333333,
        37.8       ,  20.8       ,  20.46666667,  24.83333333,
         8.52941176,  15.61538462,  29.2       ,  14.83333333,
        28.21428571,  12.28571429,  13.55555556,  16.28571429,
        17.83333333,  23.18181818,  23.14285714,  16.44444444,
        18.28571429,   1.        ,  12.8       ,  28.375     ,
        16.42857143,  25.07692308,  23.26666667,   6.625     ,
        25.6       ,  37.625     ,  18.07142857,  26.57142857,
        23.11111111,  28.88888889,  25.85714286,  20.875     ,
         6.75      ,  27.6       ,  22.27272727,  17.22222222,
        24.92307692,  38.54545455,  23.92857143,  33.63636364,
        23.84615385,  22.88888889,  24.85714286,  34.6       ,
        22.875     ,  31.4       ,  31.16666667,  21.57142857,
        51.        ,  15.8       ,  28.85714286,  15.85714286,
        16.16666667,  15.26666667,  34.16666667,  15.92307692,
        75.66666667,  20.91666667,  24.71428571,  17.875     ,
        42.6       ,  30.08333333,  31.55555556,  23.22222222,
        24.8       ,  48.        ,  30.38461538,  16.88888889,
        31.76923077,  19.63157895,  30.92857143,  43.2       ,
        22.625     ,  21.16666667,  34.88888889,  13.72727273,
        28.38461538,  14.61904762,  25.375     ,  42.5       ,
        13.08333333,  16.8       ,  48.66666667,  14.77777778,
        31.71428571,  19.375     ,  17.23076923,   6.33333333,
        32.8       ,  19.72222222,  19.77777778,  61.        ,
        22.7       ,   5.66666667,  62.        ,  28.28571429,
        13.75      ,  34.55555556,  13.8       ,  27.53846154,
        27.92307692,  30.125     ,  31.77777778,  23.76923077,
        28.44444444,  29.57142857,  22.375     ,  22.125     ,
        21.09090909,  33.08333333,  23.16666667,  19.09090909,
        32.1       ,  23.88888889,  71.5       ,  26.22222222,
        20.09090909,  35.16666667,  33.55555556,   6.4       ,
        30.81818182,  31.125     ,  25.16666667,  21.42857143,
        33.2       ,  17.09090909,  14.28571429,  32.2       ,
        17.3       ,  31.33333333,  18.625     ,  10.91666667,
        24.28571429,  21.41666667,  30.14285714,  13.57142857,
        14.71428571,  23.42857143,  28.61538462, 151.        ,
        13.14285714,  24.22222222,  37.75      ,  45.57142857])
Code Text

Gemini

Code Text

Gemini

Code Text

Gemini

Code Text

Gemini

Code Text

Gemini
# ===== COMPREHENSIVE MODEL EVALUATION =====
# Consolidated overview of the tuned model's test-set performance: headline
# metrics plus the four raw confusion-matrix cell counts.
print("\n🎯 COMPREHENSIVE MODEL PERFORMANCE OVERVIEW")
print("=" * 50)

# Additional metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Compute all key metrics against the same prediction vector
accuracy, precision, recall, f1 = (
    scorer(y_test, y_test_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"📈 DETAILED METRICS:")
print(f"   Accuracy:  {accuracy:.3f}")
print(f"   Precision: {precision:.3f}")
print(f"   Recall:    {recall:.3f}")
print(f"   F1-Score:  {f1:.3f}")
print(f"   ROC-AUC:   {test_roc_auc:.3f}")
print(f"   PR-AUC:    {test_pr_auc:.3f}")

# Confusion matrix, unpacked into its four cells (binary case: tn fp fn tp)
cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()
print(f"\n🔢 CONFUSION MATRIX:")
print(f"   True Negatives:  {tn}")
print(f"   False Positives: {fp}")
print(f"   False Negatives: {fn}")
print(f"   True Positives:  {tp}")
🎯 COMPREHENSIVE MODEL PERFORMANCE OVERVIEW
==================================================
📈 DETAILED METRICS:
   Accuracy:  0.740
   Precision: 0.852
   Recall:    0.808
   F1-Score:  0.829
   ROC-AUC:   0.709
   PR-AUC:    0.886

🔢 CONFUSION MATRIX:
   True Negatives:  71
   False Positives: 72
   False Negatives: 98
   True Positives:  413
Code Text

Gemini
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, precision_recall_curve

# Create a comprehensive evaluation plot: ROC curve, PR curve, confusion
# matrix, and (when the estimator exposes them) top feature importances.
# Fix: the pasted source had lost comma separators throughout —
# subplots(2, 2, figsize=(15, 12)), axes[0, 0], [0, 1], [0.0, 1.0], etc.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Model Performance Evaluation', fontsize=16, fontweight='bold')

# 1. ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
axes[0, 0].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {test_roc_auc:.3f})')
axes[0, 0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.8)  # chance diagonal
axes[0, 0].set_xlim([0.0, 1.0])
axes[0, 0].set_ylim([0.0, 1.05])
axes[0, 0].set_xlabel('False Positive Rate')
axes[0, 0].set_ylabel('True Positive Rate')
axes[0, 0].set_title('ROC Curve')
axes[0, 0].legend(loc="lower right")
axes[0, 0].grid(True, alpha=0.3)

# 2. Precision-Recall Curve
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_test_proba)
axes[0, 1].plot(recall_vals, precision_vals, color='blue', lw=2, label=f'PR curve (AUC = {test_pr_auc:.3f})')
axes[0, 1].set_xlim([0.0, 1.0])
axes[0, 1].set_ylim([0.0, 1.05])
axes[0, 1].set_xlabel('Recall')
axes[0, 1].set_ylabel('Precision')
axes[0, 1].set_title('Precision-Recall Curve')
axes[0, 1].legend(loc="upper right")
axes[0, 1].grid(True, alpha=0.3)

# 3. Confusion Matrix Heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted 0', 'Predicted 1'],
            yticklabels=['Actual 0', 'Actual 1'], ax=axes[1, 0])
axes[1, 0].set_title('Confusion Matrix')

# 4. Feature Importance (if available)
# NOTE(review): final_model is a Pipeline here, which has no
# feature_importances_ attribute, so this panel stays empty unless the bare
# classifier is passed instead — confirm intent.
if hasattr(final_model, 'feature_importances_'):
    # Get top 10 most important features
    feature_names = X_train.columns if hasattr(X_train, 'columns') else [f'Feature_{i}' for i in range(X_train.shape[1])]
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': final_model.feature_importances_
    }).sort_values('importance', ascending=False).head(10)

    axes[1, 1].barh(range(len(importance_df)), importance_df['importance'])
    axes[1, 1].set_yticks(range(len(importance_df)))
    axes[1, 1].set_yticklabels(importance_df['feature'])
    axes[1, 1].set_xlabel('Feature Importance')
    axes[1, 1].set_title('Top 10 Feature Importances')
    axes[1, 1].invert_yaxis()

plt.tight_layout()
plt.show()
Code Text

Gemini

Code Text

Gemini
import joblib
import json

# Persist the trained pipeline and its metadata so the Flask app below can
# load them from the Colab filesystem.
joblib.dump(final_model, '/content/loan_default_model.pkl')

# Record the exact feature order the model expects (None when X_train is not
# a DataFrame). Fix: restore the ')' and spaces lost in the paste.
model_info['feature_names'] = list(X_train.columns) if hasattr(X_train, 'columns') else None

with open('/content/model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)

print("✅ Model saved in Colab at /content/loan_default_model.pkl")
print("✅ Model info saved at /content/model_info.json")

# Verify files exist
import os
print(f"Model file exists: {os.path.exists('/content/loan_default_model.pkl')}")
print(f"Model info file exists: {os.path.exists('/content/model_info.json')}")
✅ Model saved in Colab at /content/loan_default_model.pkl
✅ Model info saved at /content/model_info.json
Model file exists: True
Model info file exists: True
Code Text

Gemini
# Install required packages
!pip install flask flask-ngrok pyngrok

# Import ngrok for public URL
from pyngrok import ngrok
import threading
import time
Requirement already satisfied: flask in /usr/local/lib/python3.12/dist-packages (3.1.1)
Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Requirement already satisfied: pyngrok in /usr/local/lib/python3.12/dist-packages (7.3.0)
Requirement already satisfied: blinker>=1.9.0 in /usr/local/lib/python3.12/dist-packages (from flask) (1.9.0)
Requirement already satisfied: click>=8.1.3 in /usr/local/lib/python3.12/dist-packages (from flask) (8.2.1)
Requirement already satisfied: itsdangerous>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from flask) (2.2.0)
Requirement already satisfied: jinja2>=3.1.2 in /usr/local/lib/python3.12/dist-packages (from flask) (3.1.6)
Requirement already satisfied: markupsafe>=2.1.1 in /usr/local/lib/python3.12/dist-packages (from flask) (3.0.2)
Requirement already satisfied: werkzeug>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from flask) (3.1.3)
Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from flask-ngrok) (2.32.4)
Requirement already satisfied: PyYAML>=5.1 in /usr/local/lib/python3.12/dist-packages (from pyngrok) (6.0.2)
Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->flask-ngrok) (3.4.3)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->flask-ngrok) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->flask-ngrok) (2.5.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->flask-ngrok) (2025.8.3)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Code Text

Gemini
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.9/9.9 MB 59.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 524.0/524.0 kB 29.2 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.9/6.9 MB 71.1 MB/s eta 0:00:00
Code Text

Gemini

Code Text

Gemini
from flask import Flask, request, jsonify
import joblib
import pandas as pd
import numpy as np
import json
import os

app = Flask(__name__)

# Load model with correct Colab path
try:
    model = joblib.load('/content/loan_default_model.pkl')
    with open('/content/model_info.json''r'as f:
        model_info = json.load(f)
    print("✅ Model loaded successfully in Colab!")
except FileNotFoundError as e:
    print(f"❌ Model file not found: {e}")
    model = None
    model_info = None

@app.route('/')
def home():
    if model is None:
        return '<h1>❌ Model Not Found</h1><p>Please save your model first!</p>'
    
    return f'''
    <h1>🎯 Loan Default Predictor API</h1>
    <p>✅ Model loaded successfully in Google Colab!</p>
    <p><strong>Model Type:</strong> {model_info.get('model_type''Unknown')}</p>
    <p><strong>Test ROC-AUC:</strong> {model_info.get('test_roc_auc''N/A'):.3f}</p>
    <p><strong>Features:</strong> {model_info.get('features_shape''Unknown')}</p>
    
    <h3>📡 Available Endpoints:</h3>
    <ul>
        <li><strong>GET /</strong> - This page</li>
        <li><strong>GET /health</strong> - Health check</li>
        <li><strong>POST /predict</strong> - Make predictions</li>
    </ul>
    
    <h3>🧪 Test Prediction:</h3>
    <p>Send POST request to /predict with your features as JSON</p>
    '''

@app.route('/health')
def health():
    if model is None:
        return jsonify({'status''error''message''Model not loaded'}), 500
    
    return jsonify({
        'status''healthy',
        'platform''Google Colab',
        'model_type': model_info.get('model_type'),
        'test_roc_auc': model_info.get('test_roc_auc'),
        'features_count': model_info.get('features_shape')
    })

@app.route('/predict', methods=['POST'])
def predict():
    """Score one loan application posted as JSON.

    Expects a flat JSON object of feature values; returns the class
    prediction, the default probability, and a coarse risk band.
    """
    if model is None:
        return jsonify({'error': 'Model not loaded'}), 500

    try:
        data = request.get_json()
        if not data:
            return jsonify({'error': 'No data provided'}), 400

        # Convert to a single-row DataFrame so the pipeline sees named columns
        df = pd.DataFrame([data])

        # Ensure correct feature order; missing features default to 0
        if model_info.get('feature_names'):
            df = df.reindex(columns=model_info['feature_names'], fill_value=0)

        # Make prediction (class label + positive-class probability)
        prediction = model.predict(df)[0]
        probability = model.predict_proba(df)[0][1]

        # Risk assessment from fixed probability thresholds
        if probability > 0.7:
            risk_level = 'High Risk'
            recommendation = 'Loan application should be rejected'
        elif probability > 0.4:
            risk_level = 'Medium Risk'
            recommendation = 'Loan application needs additional review'
        else:
            risk_level = 'Low Risk'
            recommendation = 'Loan application can be approved'

        return jsonify({
            'prediction': int(prediction),
            'probability': float(probability),
            'probability_percent': f"{probability * 100:.1f}%",
            'risk_level': risk_level,
            'recommendation': recommendation,
            'timestamp': str(pd.Timestamp.now())
        })

    except Exception as e:
        return jsonify({'error': f'Prediction failed: {str(e)}'}), 400

# Function to run Flask in background
def run_flask():
    """Launch the API on all interfaces, port 5000.

    Debug mode and the auto-reloader are both disabled so the server can
    run inside a background thread in Colab without forking.
    """
    server_opts = dict(host='0.0.0.0', port=5000, debug=False, use_reloader=False)
    app.run(**server_opts)

print("Flask app created successfully!")
✅ Model loaded successfully in Colab!
Flask app created successfully!
Code Text

Gemini
# === STEP 1: Create Streamlit app that uses your existing data ===
%%writefile view_my_model.py
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import pickle
import joblib

# Set page config (fixes "Defalt" -> "Default" typo in the browser-tab title)
st.set_page_config(
    page_title="Loan Default Risk Predictor App",
    page_icon="🏦",
    layout="wide"
)

st.title("🏦 My Loan Model Dashboard")
st.markdown("### Viewing Your Trained Model & Data")

# Load your data and model (you'll need to modify these paths)
@st.cache_data
def load_your_data():
    """Load the loan dataset for the dashboard.

    Prefers an in-memory DataFrame named ``df`` if one exists; otherwise
    falls back to a reproducible synthetic sample so the dashboard can
    still be demoed stand-alone.
    """
    # MODIFY THESE PATHS TO MATCH YOUR ACTUAL VARIABLES
    # Option 1: reuse an existing DataFrame variable called 'df'.
    # NOTE: dict.get() never raises, so we must test for None explicitly —
    # the original try/except fallback branch was unreachable.
    existing = globals().get('df')
    if existing is not None:
        return existing

    # Option 2: load from file, e.g. pd.read_csv('your_data.csv')
    # Option 3: create sample data for demo (seeded for reproducibility)
    np.random.seed(42)
    n_samples = 1000

    data = {
        'loanamount': np.random.uniform(1000, 50000, n_samples),
        'termdays': np.random.choice([30, 60, 90, 120, 180, 365], n_samples),
        'good_bad_flag': np.random.choice([0, 1], n_samples, p=[0.7, 0.3]),
        'bank_account_type': np.random.choice(['Savings', 'Current', 'Fixed'], n_samples),
        'employment_status_clients': np.random.choice(['Permanent', 'Temporary', 'Unemployed', 'Self-employed'], n_samples),
        'birth_year': np.random.randint(1960, 2000, n_samples),
        'num_prev_loans': np.random.poisson(2, n_samples),
        'avg_repay_delay_days': np.random.exponential(5, n_samples),
        'age': 2024 - np.random.randint(1960, 2000, n_samples),
        'interest_curr_rate': np.random.uniform(0.05, 0.25, n_samples),
        'repayment_curr_ratio': np.random.uniform(0.8, 1.2, n_samples),
        'repayment_efficiency': np.random.uniform(0.6, 1.0, n_samples),
        'late_payment_rate': np.random.exponential(0.1, n_samples),
    }
    return pd.DataFrame(data)

@st.cache_resource
def load_your_model():
    """Return the trained model, or None when no model can be found.

    Prefers an in-memory variable named ``model``; extend the file-loading
    section below to load a serialized model instead.
    """
    # MODIFY THIS TO LOAD YOUR ACTUAL MODEL
    # Option 1: reuse an existing variable. dict.get() never raises, so
    # check for None explicitly (the original except branch never ran).
    existing = globals().get('model')
    if existing is not None:
        return existing
    # Option 2: load from file, e.g.:
    # return joblib.load('your_model.pkl')
    # return pickle.load(open('your_model.pkl', 'rb'))
    return None

# Load data and model
df = load_your_data()
model = load_your_model()

if df is not None:
    # Basic info: four headline metrics across the top of the page
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Records", f"{len(df):,}")
    with col2:
        st.metric("Features", f"{df.shape[1]}")
    with col3:
        if 'good_bad_flag' in df.columns:
            default_rate = df['good_bad_flag'].mean() * 100
            st.metric("Default Rate", f"{default_rate:.1f}%")
        else:
            st.metric("Default Rate", "N/A")
    with col4:
        if model is not None:
            st.metric("Model Status", "✅ Loaded")
        else:
            st.metric("Model Status", "❌ Not Found")

    # Display data sample
    st.subheader("📊 Your Data Sample")
    st.dataframe(df.head(10), use_container_width=True)

    # Data info
    # Two-column layout: schema/null overview on the left, numeric
    # describe() summary on the right.
    st.subheader("📈 Data Info")
    col1, col2 = st.columns(2)

    with col1:
        st.write("**Column Types:**")
        # Per-column dtype, non-null and null counts in one table
        dtype_info = pd.DataFrame({
            'Column': df.columns,
            'Type': df.dtypes,
            'Non-Null': df.count(),
            'Null Count': df.isnull().sum()
        })
        st.dataframe(dtype_info, use_container_width=True)

    with col2:
        st.write("**Numeric Summary:**")
        # Only render describe() if at least one numeric column exists
        if len(df.select_dtypes(include=[np.number]).columns) > 0:
            st.dataframe(df.describe(), use_container_width=True)

    # Feature distributions
    st.subheader("📊 Feature Analysis")
    
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    
    if numeric_cols:
        selected_feature = st.selectbox("Select feature to analyze:", numeric_cols)
        
        col1, col2 = st.columns(2)
        
        with col1:
            # Distribution
            fig = px.histogram(df, x=selected_feature, title=f"Distribution of {selected_feature}")
            st.plotly_chart(fig, use_container_width=True)
        
        with col2:
            # Box plot by target if available
            if 'good_bad_flag' in df.columns:
                fig = px.box(df, x='good_bad_flag', y=selected_feature, 
                           title=f"{selected_feature} by Risk")
                st.plotly_chart(fig, use_container_width=True)
            else:
                # Just show basic stats
                stats = df[selected_feature].describe()
                st.write(f"**{selected_feature} Statistics:**")
                for stat, value in stats.items():
                    st.write(f"- {stat.title()}{value:.2f}")

    # Correlation analysis
    if len(numeric_cols) > 1:
        st.subheader("🔗 Feature Correlations")
        corr_matrix = df[numeric_cols].corr()
        
        fig = px.imshow(corr_matrix, 
                       title="Correlation Heatmap",
                       color_continuous_scale='RdBu')
        st.plotly_chart(fig, use_container_width=True)

    # Model predictions (if model is available)
    if model is not None:
        st.subheader("🤖 Model Predictions")
        
        try:
            # Try to make predictions
            X = df.select_dtypes(include=[np.number])
            
            if hasattr(model, 'predict'):
                predictions = model.predict(X)
                prediction_proba = None
                
                if hasattr(model, 'predict_proba'):
                    prediction_proba = model.predict_proba(X)
                
                # Add predictions to display
                display_df = df.copy()
                display_df['Prediction'] = predictions
                
                if prediction_proba is not None:
                    display_df['Probability'] = prediction_proba[:, 1if prediction_proba.shape[1] > 1 else prediction_proba.flatten()
                
                st.write("**Sample Predictions:**")
                st.dataframe(display_df[['Prediction'] + (['Probability'if 'Probability' in display_df.columns else [])].head(10))
                
                # Prediction distribution
                pred_counts = pd.Series(predictions).value_counts()
                fig = px.bar(x=pred_counts.index, y=pred_counts.values, 
                           title="Prediction Distribution")
                st.plotly_chart(fig, use_container_width=True)
                
            else:
                st.warning("Model doesn't have a predict method")
                
        except Exception as e:
            st.error(f"Error making predictions: {e}")
            st.info("This might be due to feature mismatch or model format")

    # Model performance (if target variable exists)
    if 'good_bad_flag' in df.columns and model is not None:
        st.subheader("📈 Model Performance")
        
        try:
            from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
            
            X = df.select_dtypes(include=[np.number])
            y_true = df['good_bad_flag']
            y_pred = model.predict(X)
            
            # Calculate metrics
            accuracy = accuracy_score(y_true, y_pred)
            precision = precision_score(y_true, y_pred, average='weighted')
            recall = recall_score(y_true, y_pred, average='weighted')
            f1 = f1_score(y_true, y_pred, average='weighted')
            
            # Display metrics
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Accuracy"f"{accuracy:.3f}")
            with col2:
                st.metric("Precision"f"{precision:.3f}")
            with col3:
                st.metric("Recall"f"{recall:.3f}")
            with col4:
                st.metric("F1-Score"f"{f1:.3f}")
            
            # Confusion matrix
            cm = confusion_matrix(y_true, y_pred)
            fig = px.imshow(cm, text_auto=True, aspect="auto"
                           title="Confusion Matrix")
            st.plotly_chart(fig, use_container_width=True)
            
        except Exception as e:
            st.error(f"Error calculating performance metrics: {e}")

else:
    st.error("❌ Could not load your data. Please check the data loading section in the code.")

# Instructions for user
st.sidebar.header("📝 Instructions")
st.sidebar.markdown("""
**To use with your actual data:**

1. **Modify the `load_your_data()` function** to point to your DataFrame variable or file

2. **Modify the `load_your_model()` function** to load your trained model

3. **Common variable names to try:**
   - `df`, `data`, `loan_data`
   - `model`, `clf`, `classifier`

4. **File loading examples:**
   ```python
   # For CSV
   return pd.read_csv('your_file.csv')
   
   # For model
   return joblib.load('model.pkl')
   return pickle.load(open('model.pkl', 'rb'))
   ```
""")

# === STEP 2: Quick setup to run with your existing variables ===

# First, let's try to detect what variables you have available.
# Uses globals()[name] instead of eval() (safer, and consistent with the
# guarded detection loop used elsewhere in this notebook).
print("🔍 Detecting your variables...")
print("Available DataFrames:")
for var_name in dir():
    if not var_name.startswith('_'):
        try:
            var_obj = globals()[var_name]
            if isinstance(var_obj, pd.DataFrame):
                print(f"  📊 {var_name}: {var_obj.shape} - {list(var_obj.columns[:5])}...")
        except KeyError:
            pass

print("\nAvailable potential models:")
for var_name in dir():
    if not var_name.startswith('_'):
        try:
            var_obj = globals()[var_name]
            # Duck-typed model check: anything exposing fit() and predict()
            if hasattr(var_obj, 'predict') and hasattr(var_obj, 'fit'):
                print(f"  🤖 {var_name}: {type(var_obj)}")
        except KeyError:
            pass

# === STEP 3: Simple inline viewer (works immediately) ===
print("\n" + "="*50)
print("📊 QUICK DATA OVERVIEW")
print("="*50)

# Try to find your data automatically: collect candidate DataFrames and
# duck-typed models (anything with both fit() and predict()).
data_vars = []
model_vars = []

for var_name in dir():
    if not var_name.startswith('_'):
        try:
            var_obj = globals()[var_name]  # safer than eval()
            if isinstance(var_obj, pd.DataFrame) and len(var_obj) > 0:
                data_vars.append((var_name, var_obj))
            elif hasattr(var_obj, 'predict') and hasattr(var_obj, 'fit'):
                model_vars.append((var_name, var_obj))
        except Exception:
            pass

if data_vars:
    var_name, df_found = data_vars[0]  # Use first DataFrame found
    print(f"\n✅ Found DataFrame: {var_name}")
    print(f"Shape: {df_found.shape}")
    print(f"Columns: {list(df_found.columns)}")
    print(f"\nFirst 5 rows:")
    print(df_found.head())

    if 'good_bad_flag' in df_found.columns:
        default_rate = df_found['good_bad_flag'].mean() * 100
        print(f"\n📊 Default Rate: {default_rate:.1f}%")

if model_vars:
    model_name, model_found = model_vars[0]
    print(f"\n✅ Found Model: {model_name}")
    print(f"Type: {type(model_found)}")

print(f"\n🚀 To launch full dashboard, run the Streamlit app created above!")
Writing view_my_model.py
Code Text

Gemini
# === STEP 2: Detect your existing variables ===
# Scans the notebook namespace for DataFrames and duck-typed models.
# The ': ' separators in the f-strings match this cell's own printed
# output (e.g. "📊 X: (3269, 18) - [...]").
print("🔍 Detecting your variables...")
print("Available DataFrames:")
for var_name in dir():
    if not var_name.startswith('_'):
        try:
            var_obj = globals()[var_name]
            if isinstance(var_obj, pd.DataFrame):
                print(f"  📊 {var_name}: {var_obj.shape} - {list(var_obj.columns[:5])}...")
        except:
            pass

print("\nAvailable potential models:")
for var_name in dir():
    if not var_name.startswith('_'):
        try:
            var_obj = globals()[var_name]
            # Duck-typed model check: needs both predict() and fit()
            if hasattr(var_obj, 'predict') and hasattr(var_obj, 'fit'):
                print(f"  🤖 {var_name}: {type(var_obj)}")
        except:
            pass

# Show quick preview of your data (first DataFrame with more than 10 rows)
data_found = False
for var_name in dir():
    if not var_name.startswith('_'):
        try:
            var_obj = globals()[var_name]
            if isinstance(var_obj, pd.DataFrame) and len(var_obj) > 10:
                print(f"\n✅ Found your DataFrame: '{var_name}'")
                print(f"Shape: {var_obj.shape}")
                print(f"Columns: {list(var_obj.columns)}")
                print(f"\nSample data:")
                print(var_obj.head(3))

                if 'good_bad_flag' in var_obj.columns:
                    default_rate = var_obj['good_bad_flag'].mean() * 100
                    print(f"\n📊 Default Rate: {default_rate:.1f}%")

                data_found = True
                break
        except:
            pass

if not data_found:
    print("\n❌ No DataFrame found. Make sure your data is loaded in a pandas DataFrame.")
🔍 Detecting your variables...
Available DataFrames:
  📊 X: (3269, 18) - ['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients', 'birth_year']...
  📊 X_test: (654, 18) - ['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients', 'birth_year']...
  📊 X_train: (2615, 18) - ['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients', 'birth_year']...
  📊 X_train_processed_df: (2615, 5153) - ['birth_year', 'num_prev_loans', 'avg_repay_delay_days', 'total_firstrepaid_late', 'avg_prev_repayment_ratio']...
  📊 df_demo: (4334, 8) - ['customerid', 'birthdate', 'bank_account_type', 'longitude_gps', 'latitude_gps']...
  📊 df_final: (3269, 24) - ['loanamount', 'termdays', 'good_bad_flag', 'approved_time', 'creation_time']...
  📊 df_main: (3269, 18) - ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate']...
  📊 df_perf: (4368, 11) - ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate']...
  📊 df_prevloans: (18183, 17) - ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate']...
  📊 df_prevloans_agg: (4359, 9) - ['customerid', 'num_prev_loans', 'avg_prev_loanamt', 'avg_repay_delay_days', 'total_firstrepaid_late']...
  📊 df_results: (1, 6) - ['Train Accuracy', 'Test Accuracy', 'Precision Score', 'Recall Score', 'F1 Score']...
  📊 heatmap: (6, 6) - ['sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount', 'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency']...
  📊 metrics_df: (6, 6) - ['Train Accuracy', 'Test Accuracy', 'Precision Score', 'Recall Score', 'F1 Score']...
  📊 missing_ids: (5, 18) - ['customerid', 'systemloanid', 'loannumber', 'approveddate', 'creationdate']...

Available potential models:
  🤖 CatBoostClassifier: <class 'type'>
  🤖 DecisionTreeClassifier: <class 'abc.ABCMeta'>
  🤖 GradientBoostingClassifier: <class 'abc.ABCMeta'>
  🤖 GridSearchCV: <class 'abc.ABCMeta'>
  🤖 LogisticRegression: <class 'type'>
  🤖 Pipeline: <class 'abc.ABCMeta'>
  🤖 RandomForestClassifier: <class 'abc.ABCMeta'>
  🤖 RandomizedSearchCV: <class 'abc.ABCMeta'>
  🤖 XGBClassifier: <class 'type'>
  🤖 best_gb_model: <class 'imblearn.pipeline.Pipeline'>
  🤖 final_model: <class 'imblearn.pipeline.Pipeline'>
  🤖 gb_pipeline: <class 'imblearn.pipeline.Pipeline'>
  🤖 gb_random: <class 'sklearn.model_selection._search.RandomizedSearchCV'>
  🤖 model: <class 'imblearn.pipeline.Pipeline'>
  🤖 pipeline: <class 'imblearn.pipeline.Pipeline'>

✅ Found your DataFrame: 'X'
Shape: (3269, 18)
Columns: ['approved_time', 'creation_time', 'bank_account_type', 'employment_status_clients', 'birth_year', 'num_prev_loans', 'avg_repay_delay_days', 'total_firstrepaid_late', 'avg_prev_repayment_ratio', 'avg_duration_days', 'age', 'interest_curr_rate', 'repayment_curr_ratio', 'sqrt_late_payment_rate', 'sqrt_termdays', 'sqrt_loanamount', 'sqrt_avg_prev_interest', 'sqrt_repayment_efficiency']

Sample data:
  approved_time creation_time bank_account_type employment_status_clients  \
0      08:22:56      07:22:47             Other                 Permanent   
1      17:04:41      16:04:18           Savings                 Permanent   
2      14:52:57      13:52:51             Other                 Permanent   

   birth_year  num_prev_loans  avg_repay_delay_days  total_firstrepaid_late  \
0        1972           11.00                 -0.91                    3.00   
1        1985            0.00                  0.00                    0.00   
2        1984            6.00                  0.83                    1.00   

   avg_prev_repayment_ratio  avg_duration_days  age  interest_curr_rate  \
0                      1.23              29.45   45                0.15   
1                      0.00               0.00   31                0.15   
2                      1.18              18.17   32                0.11   

   repayment_curr_ratio  sqrt_late_payment_rate  sqrt_termdays  \
0                  1.15                    0.52           5.48   
1                  1.15                    0.00           5.48   
2                  1.11                    0.41           3.87   

   sqrt_loanamount  sqrt_avg_prev_interest  sqrt_repayment_efficiency  
0           173.21                   62.45                       0.97  
1           122.47                    0.00                    1072.38  
2           141.42                   41.83                       0.97  
Code Text

Gemini
# === Create the Loan Default Risk Predictor App ===
dashboard_code = '''import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Set page config
st.set_page_config(
    page_title="Loan Default Risk Predictor",
    page_icon="⚠️",
    layout="wide"
)

# Header with branding
st.markdown("""
<div style='text-align: center; padding: 20px; background: linear-gradient(90deg, #FF6B6B, #4ECDC4); border-radius: 10px; margin-bottom: 20px;'>
    <h1 style='color: white; margin: 0;'>⚠️ Loan Default Risk Predictor App</h1>
    <p style='color: white; margin: 5px 0 0 0; font-size: 18px;'>Advanced ML-Powered Credit Risk Assessment System</p>
</div>
""", unsafe_allow_html=True)

# Load your actual variables
try:
    # Access your variables from the global environment
    exec("from __main__ import df_final, final_model, X, X_test, y_test", globals())
    df = df_final
    model = final_model
    data_loaded = True
    st.sidebar.success("✅ Data & Model Loaded Successfully")
except Exception as e:
    st.error(f"⚠️ Could not load your data variables: {e}")
    data_loaded = False
    st.sidebar.error("❌ Data Loading Failed")

if data_loaded:
    # Key Performance Indicators Dashboard
    st.markdown("### 📊 Risk Assessment Dashboard")
    
    col1, col2, col3, col4, col5 = st.columns(5)
    
    with col1:
        st.metric("📋 Total Loans", f"{len(df):,}")
    
    with col2:
        st.metric("🔢 Features", f"{df.shape[1]}")
    
    with col3:
        if 'good_bad_flag' in df.columns:
            default_rate = df['good_bad_flag'].mean() * 100
            st.metric("⚠️ Default Rate", f"{default_rate:.1f}%", 
                     delta=f"{default_rate-20:.1f}%" if default_rate > 20 else f"+{20-default_rate:.1f}%")
        else:
            st.metric("⚠️ Default Rate", "N/A")
    
    with col4:
        if 'loanamount' in df.columns:
            avg_loan = df['loanamount'].mean()
            st.metric("💰 Avg Loan", f"${avg_loan:,.0f}")
        else:
            st.metric("💰 Avg Loan", "N/A")
    
    with col5:
        st.metric("🤖 ML Model", "Active", delta="Trained")

    # Main Navigation Tabs
    tab1, tab2, tab3, tab4, tab5 = st.tabs([
        "🏠 Overview", 
        "📈 Risk Analytics", 
        "🎯 Predictive Insights", 
        "🤖 Model Performance",
        "⚡ Live Predictions"
    ])

    with tab1:
        st.markdown("## 🏠 Loan Portfolio Overview")
        
        col1, col2 = st.columns([2, 1])
        
        with col1:
            st.subheader("📋 Recent Loan Applications")
            display_df = df.head(15).copy()
            
            # Add risk labels for better understanding
            if 'good_bad_flag' in display_df.columns:
                display_df['Risk_Status'] = display_df['good_bad_flag'].map({0: '✅ Low Risk', 1: '⚠️ High Risk'})
            
            st.dataframe(display_df, use_container_width=True)
        
        with col2:
            st.subheader("📊 Portfolio Health")
            
            if 'good_bad_flag' in df.columns:
                # Risk distribution pie chart
                risk_counts = df['good_bad_flag'].value_counts()
                fig = px.pie(
                    values=risk_counts.values, 
                    names=['✅ Good Loans', '⚠️ Risky Loans'],
                    title="Loan Risk Distribution",
                    color_discrete_sequence=['#2E8B57', '#FF6347']
                )
                st.plotly_chart(fig, use_container_width=True)
                
                # Risk metrics
                total_loans = len(df)
                risky_loans = df['good_bad_flag'].sum()
                safe_loans = total_loans - risky_loans
                
                st.metric("🟢 Safe Loans", f"{safe_loans:,}", f"{(safe_loans/total_loans)*100:.1f}%")
                st.metric("🔴 Risky Loans", f"{risky_loans:,}", f"{(risky_loans/total_loans)*100:.1f}%")
        
        # Data Quality Assessment
        st.subheader("🔍 Data Quality Report")
        missing_data = df.isnull().sum()
        if missing_data.sum() > 0:
            st.warning("⚠️ Missing data detected in some fields")
            missing_df = missing_data[missing_data > 0].sort_values(ascending=False)
            fig = px.bar(x=missing_df.values, y=missing_df.index, orientation='h',
                        title="Missing Values by Feature", color_discrete_sequence=['#FFA500'])
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.success("✅ Data quality excellent - no missing values detected!")

    with tab2:
        st.markdown("## 📈 Risk Analytics Dashboard")
        
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        
        if numeric_cols:
            col1, col2 = st.columns([1, 3])
            
            with col1:
                st.subheader("🎛️ Analytics Controls")
                selected_feature = st.selectbox("📊 Select Risk Factor:", numeric_cols)
                
                # Feature statistics
                if selected_feature in df.columns:
                    feature_stats = df[selected_feature].describe()
                    st.markdown("**📈 Quick Stats:**")
                    st.write(f"**Mean:** {feature_stats['mean']:.2f}")
                    st.write(f"**Std:** {feature_stats['std']:.2f}")
                    st.write(f"**Min:** {feature_stats['min']:.2f}")
                    st.write(f"**Max:** {feature_stats['max']:.2f}")
            
            with col2:
                # Feature distribution
                fig_dist = px.histogram(df, x=selected_feature, nbins=30,
                                      title=f"📊 Distribution of {selected_feature}",
                                      color_discrete_sequence=['#4ECDC4'])
                st.plotly_chart(fig_dist, use_container_width=True)
                
                # Risk comparison
                if 'good_bad_flag' in df.columns:
                    fig_box = px.box(df, x='good_bad_flag', y=selected_feature,
                                   title=f"🎯 {selected_feature} by Risk Category",
                                   color='good_bad_flag',
                                   color_discrete_sequence=['#2E8B57', '#FF6347'])
                    fig_box.update_xaxes(tickvals=[0, 1], ticktext=['✅ Good Risk', '⚠️ Bad Risk'])
                    st.plotly_chart(fig_box, use_container_width=True)
        
        # Risk Factor Correlation Analysis
        if 'good_bad_flag' in df.columns and len(numeric_cols) > 1:
            st.subheader("🎯 Risk Factor Impact Analysis")
            correlations = df[numeric_cols].corrwith(df['good_bad_flag']).abs().sort_values(ascending=True)
            
            fig = px.bar(x=correlations.values, y=correlations.index,
                       orientation='h', title="🔥 Feature Impact on Default Risk",
                       color=correlations.values, color_continuous_scale='Reds')
            st.plotly_chart(fig, use_container_width=True)

    with tab3:
        st.markdown("## 🎯 Predictive Risk Insights")
        
        if 'good_bad_flag' in df.columns:
            categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
            categorical_features = [col for col in categorical_cols if df[col].nunique() < 20]
            
            if categorical_features:
                st.subheader("🔍 Risk Analysis by Categories")
                selected_cat = st.selectbox("📋 Analyze risk by:", categorical_features)
                
                # Risk analysis
                risk_data = df.groupby(selected_cat)['good_bad_flag'].agg(['count', 'mean']).reset_index()
                risk_data.columns = [selected_cat, 'Total_Loans', 'Default_Rate']
                risk_data['Default_Rate_Pct'] = risk_data['Default_Rate'] * 100
                risk_data = risk_data.sort_values('Default_Rate_Pct', ascending=False)
                
                # Enhanced visualization
                fig = make_subplots(specs=[[{"secondary_y": True}]])
                
                fig.add_trace(
                    go.Bar(x=risk_data[selected_cat], y=risk_data['Total_Loans'],
                           name="📊 Number of Loans", marker_color='lightblue',
                           opacity=0.7),
                    secondary_y=False,
                )
                
                fig.add_trace(
                    go.Scatter(x=risk_data[selected_cat], y=risk_data['Default_Rate_Pct'],
                              mode='lines+markers+text', name="⚠️ Default Rate (%)",
                              line=dict(color='red', width=4),
                              marker=dict(size=10),
                              textposition="top center"),
                    secondary_y=True,
                )
                
                fig.update_xaxes(title_text=f"📋 {selected_cat}")
                fig.update_yaxes(title_text="📊 Number of Loans", secondary_y=False)
                fig.update_yaxes(title_text="⚠️ Default Rate (%)", secondary_y=True)
                fig.update_layout(title=f"🎯 Risk Analysis by {selected_cat}", height=500)
                
                st.plotly_chart(fig, use_container_width=True)
                
                # Risk ranking table
                st.subheader("📊 Risk Ranking by Category")
                risk_data_display = risk_data.copy()
                risk_data_display['Risk_Level'] = pd.cut(risk_data_display['Default_Rate_Pct'], 
                                                        bins=[0, 15, 30, 100], 
                                                        labels=['🟢 Low', '🟡 Medium', '🔴 High'])
                st.dataframe(risk_data_display, use_container_width=True)

    with tab4:
        st.markdown("## 🤖 ML Model Performance Dashboard")
        
        try:
            # Model predictions
            y_pred = model.predict(X_test)
            
            # Performance metrics
            from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
            
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            
            # Try to get probability predictions for AUC
            try:
                y_proba = model.predict_proba(X_test)[:, 1]
                auc_score = roc_auc_score(y_test, y_proba)
            except:
                auc_score = None
            
            # Performance metrics display
            st.subheader("📈 Model Performance Metrics")
            col1, col2, col3, col4, col5 = st.columns(5)
            
            with col1:
                st.metric("🎯 Accuracy", f"{accuracy:.1%}", 
                         delta=f"+{(accuracy-0.8)*100:.1f}%" if accuracy > 0.8 else f"{(accuracy-0.8)*100:.1f}%")
            with col2:
                st.metric("🎯 Precision", f"{precision:.1%}")
            with col3:
                st.metric("🎯 Recall", f"{recall:.1%}")
            with col4:
                st.metric("🎯 F1-Score", f"{f1:.1%}")
            with col5:
                if auc_score:
                    st.metric("🎯 AUC Score", f"{auc_score:.1%}")
                else:
                    st.metric("🎯 AUC Score", "N/A")
            
            col1, col2 = st.columns(2)
            
            with col1:
                # Confusion Matrix
                cm = confusion_matrix(y_test, y_pred)
                fig = px.imshow(cm, text_auto=True, aspect="auto",
                               title="🎯 Confusion Matrix",
                               labels=dict(x="Predicted", y="Actual"),
                               color_continuous_scale='Blues')
                fig.update_xaxes(tickvals=[0, 1], ticktext=['Good', 'Bad'])
                fig.update_yaxes(tickvals=[0, 1], ticktext=['Good', 'Bad'])
                st.plotly_chart(fig, use_container_width=True)
            
            with col2:
                # Prediction distribution
                pred_counts = pd.Series(y_pred).value_counts()
                fig = px.pie(values=pred_counts.values, 
                           names=['✅ Predicted Good', '⚠️ Predicted Bad'],
                           title="🔮 Model Predictions Distribution",
                           color_discrete_sequence=['#2E8B57', '#FF6347'])
                st.plotly_chart(fig, use_container_width=True)
            
            # Model insights
            st.subheader("🔍 Model Insights")
            total_test = len(y_test)
            correct_predictions = (y_pred == y_test).sum()
            
            st.info(f"""
            **🤖 Model Summary:**
            - Tested on {total_test:,} loan applications
            - Correctly identified {correct_predictions:,} cases ({accuracy:.1%})
            - Model Type: {type(model).__name__}
            """)
            
        except Exception as e:
            st.error(f"⚠️ Error in model evaluation: {e}")
            st.info("💡 Make sure your test data (X_test, y_test) is available")

    with tab5:
        st.markdown("## ⚡ Live Risk Prediction")
        
        st.info("🚀 **Coming Soon**: Interactive loan risk calculator where you can input loan parameters and get instant risk predictions!")
        
        # Placeholder for live prediction interface
        st.subheader("🎛️ Loan Risk Calculator")
        
        col1, col2 = st.columns(2)
        
        with col1:
            loan_amount = st.number_input("💰 Loan Amount ($)", min_value=1000, max_value=100000, value=25000)
            term_days = st.selectbox("📅 Loan Term (Days)", [30, 60, 90, 120, 180, 365])
            employment_status = st.selectbox("💼 Employment Status", ['Permanent', 'Temporary', 'Self-employed', 'Unemployed'])
        
        with col2:
            age = st.slider("👤 Applicant Age", 18, 80, 35)
            account_type = st.selectbox("🏦 Account Type", ['Savings', 'Current', 'Other'])
            prev_loans = st.number_input("📋 Previous Loans", min_value=0, max_value=20, value=2)
        
        if st.button("🔮 Predict Default Risk", type="primary"):
            st.warning("⚡ Live prediction feature will be implemented with your specific model requirements!")
            st.balloons()

else:
    st.error("❌ Could not load your data. Make sure df_final and final_model are available.")
    st.markdown("""
    ### 🔧 Troubleshooting:
    1. Ensure your variables `df_final` and `final_model` are loaded
    2. Run this dashboard in the same environment as your model training
    3. Check that all required libraries are installed
    """)

# Sidebar with app info
st.sidebar.markdown("""
---
### ⚠️ Loan Default Risk Predictor

**🎯 Features:**
- Advanced ML risk assessment
- Real-time portfolio monitoring
- Interactive risk analytics
- Performance dashboards

**📊 Your Data:**
- Dataset: Loan applications
- Model: Gradient Boosting Pipeline
- Features: 24+ risk factors
""")

st.sidebar.markdown("---")
st.sidebar.markdown("🏦 **Built with:** Streamlit • Plotly • Scikit-learn")

# Footer
st.markdown("---")
st.markdown("""
<div style='text-align: center; color: #666; padding: 20px;'>
    ⚠️ <strong>Loan Default Risk Predictor App</strong> | Powered by Machine Learning<br>
    <small>Advanced Credit Risk Assessment • Real-time Analytics • Predictive Insights</small>
</div>
""", unsafe_allow_html=True)
'''

# Write the generated Streamlit dashboard source out to a standalone
# script file so it can be launched with `streamlit run`.
# Fix: the notebook export had dropped the comma/space separators in the
# open() call (`open('…''w'as f:`), which is a SyntaxError.
with open('loan_default_risk_predictor.py', 'w') as f:
    f.write(dashboard_code)  # dashboard_code is the app source string built earlier in the notebook

print("✅ Loan Default Risk Predictor App created successfully!")
print("📱 App Name: Loan Default Risk Predictor")
print("🎯 Features: Advanced ML-powered credit risk assessment")

# Now run it
import subprocess
import threading
import time

def run_streamlit():
    subprocess.run(["streamlit""run""loan_default_risk_predictor.py""--server.port""8501""--server.address""0.0.0.0"])

# Kill existing processes
!pkill -f streamlit

# Start streamlit
thread = threading.Thread(target=run_streamlit)
thread.daemon = True  
thread.start()

print("🚀 Starting your Loan Default Risk Predictor App...")
time.sleep(5)
print("✅ App should be running on port 8501")
print("💡 Use Colab's port forwarding to access your professional risk assessment app!")
✅ Loan Default Risk Predictor App created successfully!
📱 App Name: Loan Default Risk Predictor
🎯 Features: Advanced ML-powered credit risk assessment
🚀 Starting your Loan Default Risk Predictor App...
✅ App should be running on port 8501
💡 Use Colab's port forwarding to access your professional risk assessment app!
Code Text

Gemini
# Install and import required packages
!pip install streamlit-webrtc

# Use Colab's port forwarding
import subprocess
from google.colab import output
import time

# Start Streamlit
process = subprocess.Popen([
    "streamlit""run""your_app_file.py",
    "--server.port""8501"
    "--server.headless""true"
])

print("🚀 Streamlit is starting...")
time.sleep(10)

# Display the local URL - Colab should automatically detect it
print("🌐 Your app should be running at:")
print("📱 Check for a popup or notification from Colab about port forwarding")
print("🔗 Or try: https://colab.research.google.com/drive/your-notebook#scrollTo=your-cell")
Collecting streamlit-webrtc
  Downloading streamlit_webrtc-0.63.4-py3-none-any.whl.metadata (18 kB)
Collecting aioice>=0.10.1 (from streamlit-webrtc)
  Downloading aioice-0.10.1-py3-none-any.whl.metadata (4.1 kB)
Collecting aiortc>=1.11.0 (from streamlit-webrtc)
  Downloading aiortc-1.13.0-py3-none-any.whl.metadata (4.9 kB)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from streamlit-webrtc) (25.0)
Requirement already satisfied: streamlit>=0.89.0 in /usr/local/lib/python3.12/dist-packages (from streamlit-webrtc) (1.48.1)
Collecting dnspython>=2.0.0 (from aioice>=0.10.1->streamlit-webrtc)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting ifaddr>=0.2.0 (from aioice>=0.10.1->streamlit-webrtc)
  Downloading ifaddr-0.2.0-py3-none-any.whl.metadata (4.9 kB)
Collecting av<15.0.0,>=14.0.0 (from aiortc>=1.11.0->streamlit-webrtc)
  Downloading av-14.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Requirement already satisfied: cffi>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from aiortc>=1.11.0->streamlit-webrtc) (1.17.1)
Collecting cryptography>=44.0.0 (from aiortc>=1.11.0->streamlit-webrtc)
  Downloading cryptography-45.0.6-cp311-abi3-manylinux_2_34_x86_64.whl.metadata (5.7 kB)
Requirement already satisfied: google-crc32c>=1.1 in /usr/local/lib/python3.12/dist-packages (from aiortc>=1.11.0->streamlit-webrtc) (1.7.1)
Collecting pyee>=13.0.0 (from aiortc>=1.11.0->streamlit-webrtc)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting pylibsrtp>=0.10.0 (from aiortc>=1.11.0->streamlit-webrtc)
  Downloading pylibsrtp-0.12.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting pyopenssl>=25.0.0 (from aiortc>=1.11.0->streamlit-webrtc)
  Downloading pyopenssl-25.1.0-py3-none-any.whl.metadata (17 kB)
Requirement already satisfied: altair!=5.4.0,!=5.4.1,<6,>=4.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (5.5.0)
Requirement already satisfied: blinker<2,>=1.5.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (1.9.0)
Requirement already satisfied: cachetools<7,>=4.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (5.5.2)
Requirement already satisfied: click<9,>=7.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (8.2.1)
Requirement already satisfied: numpy<3,>=1.23 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (2.0.2)
Requirement already satisfied: pandas<3,>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (2.2.2)
Requirement already satisfied: pillow<12,>=7.1.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (11.3.0)
Requirement already satisfied: protobuf<7,>=3.20 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (5.29.5)
Requirement already satisfied: pyarrow>=7.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (18.1.0)
Requirement already satisfied: requests<3,>=2.27 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (2.32.4)
Requirement already satisfied: tenacity<10,>=8.1.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (8.5.0)
Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (0.10.2)
Requirement already satisfied: typing-extensions<5,>=4.4.0 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (4.14.1)
Requirement already satisfied: watchdog<7,>=2.1.5 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (6.0.0)
Requirement already satisfied: gitpython!=3.1.19,<4,>=3.0.7 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (3.1.45)
Requirement already satisfied: pydeck<1,>=0.8.0b4 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (0.9.1)
Requirement already satisfied: tornado!=6.5.0,<7,>=6.0.3 in /usr/local/lib/python3.12/dist-packages (from streamlit>=0.89.0->streamlit-webrtc) (6.4.2)
Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (3.1.6)
Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.12/dist-packages (from altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (4.25.1)
Requirement already satisfied: narwhals>=1.14.2 in /usr/local/lib/python3.12/dist-packages (from altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (2.1.2)
Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.0.0->aiortc>=1.11.0->streamlit-webrtc) (2.22)
Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.12/dist-packages (from gitpython!=3.1.19,<4,>=3.0.7->streamlit>=0.89.0->streamlit-webrtc) (4.0.12)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas<3,>=1.4.0->streamlit>=0.89.0->streamlit-webrtc) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas<3,>=1.4.0->streamlit>=0.89.0->streamlit-webrtc) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas<3,>=1.4.0->streamlit>=0.89.0->streamlit-webrtc) (2025.2)
Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests<3,>=2.27->streamlit>=0.89.0->streamlit-webrtc) (3.4.3)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests<3,>=2.27->streamlit>=0.89.0->streamlit-webrtc) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests<3,>=2.27->streamlit>=0.89.0->streamlit-webrtc) (2.5.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests<3,>=2.27->streamlit>=0.89.0->streamlit-webrtc) (2025.8.3)
Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.12/dist-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit>=0.89.0->streamlit-webrtc) (5.0.2)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (3.0.2)
Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=3.0->altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (25.3.0)
Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=3.0->altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (2025.4.1)
Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=3.0->altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (0.36.2)
Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=3.0->altair!=5.4.0,!=5.4.1,<6,>=4.0->streamlit>=0.89.0->streamlit-webrtc) (0.27.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas<3,>=1.4.0->streamlit>=0.89.0->streamlit-webrtc) (1.17.0)
Downloading streamlit_webrtc-0.63.4-py3-none-any.whl (216 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 216.9/216.9 kB 5.7 MB/s eta 0:00:00
Downloading aioice-0.10.1-py3-none-any.whl (24 kB)
Downloading aiortc-1.13.0-py3-none-any.whl (92 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 92.9/92.9 kB 8.9 MB/s eta 0:00:00
Downloading av-14.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.5/35.5 MB 35.1 MB/s eta 0:00:00
Downloading cryptography-45.0.6-cp311-abi3-manylinux_2_34_x86_64.whl (4.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.5/4.5 MB 103.4 MB/s eta 0:00:00
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 313.6/313.6 kB 23.1 MB/s eta 0:00:00
Downloading ifaddr-0.2.0-py3-none-any.whl (12 kB)
Downloading pyee-13.0.0-py3-none-any.whl (15 kB)
Downloading pylibsrtp-0.12.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.2/2.2 MB 78.5 MB/s eta 0:00:00
Downloading pyopenssl-25.1.0-py3-none-any.whl (56 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.8/56.8 kB 4.2 MB/s eta 0:00:00
Installing collected packages: ifaddr, pyee, dnspython, av, pylibsrtp, cryptography, aioice, pyopenssl, aiortc, streamlit-webrtc
  Attempting uninstall: cryptography
    Found existing installation: cryptography 43.0.3
    Uninstalling cryptography-43.0.3:
      Successfully uninstalled cryptography-43.0.3
  Attempting uninstall: pyopenssl
    Found existing installation: pyOpenSSL 24.2.1
    Uninstalling pyOpenSSL-24.2.1:
      Successfully uninstalled pyOpenSSL-24.2.1
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pydrive2 1.21.3 requires cryptography<44, but you have cryptography 45.0.6 which is incompatible.
pydrive2 1.21.3 requires pyOpenSSL<=24.2.1,>=19.1.0, but you have pyopenssl 25.1.0 which is incompatible.
Successfully installed aioice-0.10.1 aiortc-1.13.0 av-14.4.0 cryptography-45.0.6 dnspython-2.7.0 ifaddr-0.2.0 pyee-13.0.0 pylibsrtp-0.12.0 pyopenssl-25.1.0 streamlit-webrtc-0.63.4
🚀 Streamlit is starting...
🌐 Your app should be running at:
📱 Check for a popup or notification from Colab about port forwarding
🔗 Or try: https://colab.research.google.com/drive/your-notebook#scrollTo=your-cell
Code Text

Gemini
Code Text

Gemini
# Kill all streamlit processes and free up the port
# (the `!` lines are IPython shell magics — valid only inside a notebook cell)
!pkill -f streamlit
!pkill -f "port 8501"
!fuser -k 8501/tcp  # Force kill anything on port 8501

import time
# brief pause so the OS actually releases the socket before we report success
time.sleep(3)
print("✅ Port 8501 should now be free")
✅ Port 8501 should now be free
Code Text

Gemini
# Create requirements.txt listing the packages the deployed dashboard needs.
# Fix: the export had dropped the comma/space separators in the open() call
# (`open('requirements.txt''w'as f:`), which is a SyntaxError.
requirements = """
streamlit
pandas
numpy
scikit-learn
plotly
seaborn
matplotlib
"""

# strip() removes the leading/trailing blank lines from the triple-quoted literal
with open('requirements.txt', 'w') as f:
    f.write(requirements.strip())

print("✅ requirements.txt created")
print("📄 Contents:")
print(requirements)
✅ requirements.txt created
📄 Contents:

streamlit
pandas
numpy
scikit-learn
plotly
seaborn
matplotlib

Code Text

Gemini
📄 view_my_model.py
📄 requirements.txt
📄 loan_default_risk_predictor.py

🔽 You'll need to download these files: ['view_my_model.py', 'requirements.txt', 'loan_default_risk_predictor.py']
Code Text

Gemini

Variables Terminal
Add a comment
Could not connect to the reCAPTCHA service. Please check your internet connection and reload to get a reCAPTCHA challenge.